├── .gitignore ├── .golangci.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEV.md ├── Dockerfile.device-faker ├── Dockerfile.gaudi ├── Dockerfile.gaudi-test ├── Dockerfile.gpu ├── Dockerfile.qat ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── SECURITY.md ├── charts ├── intel-gaudi-resource-driver │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── templates │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── device-class.yaml │ │ ├── nfd.yaml │ │ ├── resource-driver-namespace.yaml │ │ ├── resource-driver.yaml │ │ ├── serviceaccount.yaml │ │ ├── validating-admission-policy-binding.yaml │ │ └── validating-admission-policy.yaml │ └── values.yaml ├── intel-gpu-resource-driver │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── templates │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── device-class.yaml │ │ ├── node-feature-rules.yaml │ │ ├── resource-driver.yaml │ │ ├── serviceaccount.yaml │ │ ├── validating-admission-policy-binding.yaml │ │ └── validating-admission-policy.yaml │ └── values.yaml └── intel-qat-resource-driver │ ├── Chart.yaml │ ├── README.md │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── clusterrole.yaml │ ├── clusterrolebinding.yaml │ ├── device-class.yaml │ ├── nfd.yaml │ ├── resource-driver-namespace.yaml │ ├── resource-driver.yaml │ ├── serviceaccount.yaml │ ├── validating-admission-policy-binding.yaml │ └── validating-admission-policy.yaml │ └── values.yaml ├── cmd ├── cdi-specs-generator │ └── main.go ├── device-faker │ └── main.go ├── kubelet-gaudi-plugin │ ├── driver.go │ ├── driver_test.go │ ├── healthcare.go │ ├── healthcare_test.go │ ├── main.go │ ├── node_state.go │ └── node_state_test.go ├── kubelet-gpu-plugin │ ├── driver.go │ ├── driver_test.go │ ├── main.go │ ├── node_state.go │ ├── node_state_test.go │ └── test-claims │ │ ├── empty.json │ │ ├── invalid.json │ │ └── multi.json ├── kubelet-qat-plugin │ 
├── clientsets.go │ ├── config.go │ ├── deviceresources.go │ ├── driver.go │ ├── driver_test.go │ └── main.go └── qat-showdevice │ └── main.go ├── deployments ├── gaudi │ ├── base │ │ ├── device-class.yaml │ │ ├── kustomization.yaml │ │ ├── namespace.yaml │ │ └── resource-driver.yaml │ ├── examples │ │ ├── deployment-inline.yaml │ │ ├── monitor-pod-inline.yaml │ │ └── pod-inline.yaml │ ├── kustomization.yaml │ └── overlays │ │ ├── device-faker │ │ ├── device-faker.yaml │ │ ├── kustomization.yaml │ │ └── remove-sysfs.yaml │ │ └── nfd_labeled_nodes │ │ ├── add-nodeselector-intel-gaudi.yaml │ │ ├── kustomization.yaml │ │ └── nfd-intel-gaudi-device-rule.yaml ├── gpu │ ├── base │ │ ├── device-class.yaml │ │ ├── kustomization.yaml │ │ ├── namespace.yaml │ │ └── resource-driver.yaml │ ├── examples │ │ ├── claim-external-gpu.yaml │ │ ├── deployment-inline.yaml │ │ ├── monitor-pod-inline.yaml │ │ ├── pod-for-claim-external-gpu.yaml │ │ └── pod-inline-gpu.yaml │ ├── intel-xpumanager │ │ ├── gpu-monitor-claim.yaml │ │ ├── kustomization.yaml │ │ ├── xpumd-add-dra-resource.yaml │ │ └── xpumd-delete-limits.yaml │ ├── kustomization.yaml │ └── overlays │ │ ├── device-faker │ │ ├── device-faker.yaml │ │ ├── kustomization.yaml │ │ └── remove-sysfs.yaml │ │ └── nfd_labeled_nodes │ │ ├── add-nodeselector-intel-gpu.yaml │ │ ├── kustomization.yaml │ │ ├── nfd-intel-gpu-device-rule.yaml │ │ └── nfd-intel-gpu-platform-labeling.yaml └── qat │ ├── base │ ├── device-class.yaml │ ├── kustomization.yaml │ ├── namespace.yaml │ └── resource-driver.yaml │ ├── examples │ ├── deployment-inline.yaml │ └── intel-qat-resource-driver-configuration.yaml │ ├── kustomization.yaml │ ├── overlays │ └── nfd_labeled_nodes │ │ ├── add-nodeselector-intel-qat.yaml │ │ ├── kustomization.yaml │ │ └── nfd-intel-qat-device-rule.yaml │ └── tests │ ├── openssl-qat-engine │ ├── kustomization.yaml │ └── openssl-qat-engine.yaml │ ├── qat-dpdk-test │ ├── compress-perf.yaml │ ├── crypto-perf.yaml │ ├── file.txt │ ├── 
kustomization.yaml │ └── modified-cluster-setup.yaml │ ├── qatlib-sample-code │ ├── kustomization.yaml │ └── qatlib-sample-code.yaml │ └── resource-claim-template.yaml ├── doc ├── CLUSTER_SETUP.md ├── cdi-spec-generator │ ├── BUILD.md │ └── README.md ├── device-faker │ └── README.md ├── gaudi │ ├── BUILD.md │ ├── README.md │ └── USAGE.md ├── gpu │ ├── BUILD.md │ ├── README.md │ ├── USAGE.md │ ├── allocation-delayed.puml │ ├── allocation-immediate.puml │ ├── complete-overview.puml │ ├── generate-pngs.sh │ └── high-level-overview.puml └── qat │ ├── BUILD.md │ ├── README.md │ ├── TESTING.md │ └── USAGE.md ├── gaudi.mk ├── go.mod ├── go.sum ├── gpu.mk ├── hack ├── boilerplate.go.txt ├── clusterconfig.yaml ├── fake_libhlml │ ├── Makefile │ ├── README.md │ └── fake_libhlml.c └── tools.go ├── pkg ├── fakehlml │ ├── fake_hlml.go │ └── fake_hlml.h ├── fakesysfs │ ├── fakesysfs.go │ ├── gaudi.go │ ├── gpu.go │ └── qat.go ├── gaudi │ ├── cdihelpers │ │ ├── cdihelpers.go │ │ └── cdihelpers_test.go │ ├── device │ │ ├── device.go │ │ └── device_test.go │ └── discovery │ │ ├── discovery.go │ │ └── discovery_test.go ├── gpu │ ├── cdihelpers │ │ └── cdihelpers.go │ ├── device │ │ └── device.go │ └── discovery │ │ └── discovery.go ├── helpers │ ├── device.go │ ├── device_test.go │ ├── driver.go │ ├── helpers.go │ ├── helpers_test.go │ ├── node_state.go │ └── node_state_test.go ├── plugintesthelpers │ └── plugintesthelpers.go ├── qat │ ├── cdi │ │ └── cdi.go │ └── device │ │ ├── device.go │ │ ├── device_test.go │ │ └── state.go └── version │ └── version.go ├── qat.mk └── test └── e2e ├── dra_suite_test.go ├── qat └── qat.go └── utils └── utils.go /.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /vendor/ 3 | 4 | # macOS 5 | .DS_Store 6 | 7 | # files generated by editors 8 | .idea/ 9 | *.iml 10 | .vscode/ 11 | *.swp 12 | *.sublime-project 13 | *.sublime-workspace 14 | *~ 15 | *.o 16 | *.so 17 | *.out 18 | 
-------------------------------------------------------------------------------- /.golangci.yaml: -------------------------------------------------------------------------------- 1 | # please keep this alphabetized 2 | linters: 3 | enable: 4 | - asciicheck 5 | - contextcheck 6 | - forcetypeassert 7 | - gocritic 8 | - godot 9 | - gofmt 10 | - goimports 11 | - misspell 12 | - stylecheck 13 | - gocyclo 14 | 15 | run: 16 | tests: true 17 | timeout: 1m 18 | 19 | linters-settings: 20 | gocyclo: 21 | min-complexity: 15 22 | goimports: 23 | local-prefixes: "github.com/intel/intel-resource-drivers-for-kubernetes" 24 | stylecheck: 25 | # default set minus ID - see https://golangci-lint.run/usage/linters/#stylecheck 26 | initialisms: ["ACL", "API", "ASCII", "CPU", "CSS", "DNS", "EOF", "GUID", "HTML", "HTTP", "HTTPS", "IP", "JSON", "QPS", "RAM", "RPC", "SLA", "SMTP", "SQL", "SSH", "TCP", "TLS", "TTL", "UDP", "UI", "GID", "UID", "UUID", "URI", "URL", "UTF8", "VM", "XML", "XMPP", "XSRF", "XSS", "SIP", "RTP", "AMQP", "DB", "TS"] 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | Intel Resource Drivers for Kubernetes is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 
17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 
58 | -------------------------------------------------------------------------------- /DEV.md: -------------------------------------------------------------------------------- 1 | Contents: 2 | * [Runtime](#runtime) 3 | * [Enable CDI in Containerd](#enable-cdi-in-containerd) 4 | * [Generated source code](#generated-source-code) 5 | * [Required tools](#required-tools) 6 | 7 | 8 | # Runtime 9 | 10 | Runtime needs to have CDI injection support 11 | 12 | - CRI-O: 1.23+, enabled by default. 13 | - Containerd: v1.7+, disabled by default. 14 | 15 | ## Enable CDI in Containerd 16 | 17 | Containerd config file should have `enable_cdi` and `cdi_specs_dir`. Example `/etc/containerd/config.toml`: 18 | ``` 19 | version = 2 20 | [plugins] 21 | [plugins."io.containerd.grpc.v1.cri"] 22 | enable_cdi = true 23 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"] 24 | ``` 25 | 26 | ### Determine your go binaries location from `go install --help`, quote: 27 | > Executables are installed in the directory named by the GOBIN environment 28 | > variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH 29 | > environment variable is not set. Executables in $GOROOT 30 | > are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN. 
31 | 32 | ### Way 1 : install tools with Go: 33 | 34 | #### Add Go binaries directory to PATH 35 | Add this to the end of your `$HOME/.bashrc`: 36 | ```bash 37 | export PATH="$HOME/go/bin:$PATH" 38 | ``` 39 | 40 | #### install tools 41 | ```bash 42 | GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest 43 | GO111MODULE=on go install k8s.io/code-generator/cmd/client-gen@latest 44 | ``` 45 | 46 | ### Way 2 : clone and build it: 47 | ```bash 48 | git clone https://github.com/kubernetes-sigs/controller-tools.git 49 | cd controller-tools 50 | go build ./cmd/controller-gen 51 | cd - 52 | git clone https://github.com/kubernetes/code-generator.git 53 | cd code-generator 54 | go build ./cmd/client-gen 55 | cd - 56 | ``` 57 | 58 | Make them available in PATH, for instance $HOME/go/bin: 59 | ```bash 60 | cp controller-tools/controller-gen code-generator/client-gen $HOME/go/bin 61 | # ensure it's in the path. You may want to add export to $HOME/.bashrc 62 | echo $PATH | grep -q $HOME/go/bin || export PATH=$HOME/go/bin:$PATH 63 | ``` 64 | # Running tests 65 | 66 | Since Q2 '25, the Gaudi DRA driver uses `gohlml` to retrieve health-related information. 67 | There is a hardcoded path to the HLML shared library, and `hack/fake_libhlml` was created based 68 | on the `hlml.h` from `gohlml` project - it is effectively a stub / mock with flow control support. 69 | 70 | When health-related tests call `gohlml` - it should in turn call fake `libhlml`, instead of the real 71 | one, on the nodes where there is no real Gaudi HW and SW installed (e.g. CI). This means, if the 72 | tests are run on your development machine - you should either deploy fresh fake `libhlml.so`, or 73 | run tests in a `gaudi-dra-driver-test-image` container like CI does. 
74 | 75 | Deploying fake hlml instead of real `libhlml` should allow running tests in VSCode and other IDEs, 76 | after `ldconfig` is [configured properly](hack/fake_libhlml/README.md) 77 | 78 | ## Deploying 79 | ```shell 80 | $ cd hack/fake_libhlml 81 | $ make clean 82 | rm -f fake_libhlml.o fake_libhlml.so 83 | $ make 84 | gcc -O -Wall -Wextra -Wno-unused-parameter -fPIC -c fake_libhlml.c -o fake_libhlml.o 85 | gcc -shared -o fake_libhlml.so fake_libhlml.o 86 | $ sudo cp ./fake_libhlml.so /usr/lib/habanalabs/libhlml.so 87 | $ cat << EOF | sudo tee /etc/ld.so.conf.d/habanalabs.conf 88 | /usr/lib/habanalabs/ 89 | EOF 90 | 91 | $ sudo ldconfig 92 | ``` 93 | 94 | ## Running tests in container 95 | 96 | To have your own user ID inside container image without access / permission issues, build a fresh 97 | container image, then run tests. The CI uses its own user ID. 98 | 99 | ```shell 100 | $ make test-image 101 | $ make test-containerized 102 | ``` 103 | 104 | Tests provide coverage data. If you need to see the coverage report, just run Make target for needed 105 | coverage target, e.g. 106 | 107 | ``` 108 | make gaudi-coverage 109 | ``` 110 | -------------------------------------------------------------------------------- /Dockerfile.device-faker: -------------------------------------------------------------------------------- 1 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build 2 | ARG LOCAL_LICENSES 3 | WORKDIR /build 4 | COPY . . 
5 | 6 | RUN make bin/device-faker && \ 7 | mkdir -p /install_root && \ 8 | if [ -z "$LOCAL_LICENSES" ]; then \ 9 | make licenses; \ 10 | fi && \ 11 | cp -r licenses /install_root/ && \ 12 | cp bin/device-faker /install_root/ 13 | 14 | 15 | FROM alpine AS template 16 | COPY --from=build /install_root/device-faker /device-faker 17 | 18 | 19 | RUN mkdir -p /opt/templates && \ 20 | /device-faker gpu -n && \ 21 | mv /tmp/gpu-template-*.json /opt/templates/gpu-template.json && \ 22 | /device-faker gaudi -n && \ 23 | mv /tmp/gaudi-template-*.json /opt/templates/gaudi-template.json && \ 24 | chmod 644 /opt/templates/*.json 25 | 26 | FROM scratch 27 | LABEL description="Intel Device Faker" 28 | COPY --from=build /install_root/device-faker /device-faker 29 | COPY --from=template /opt/templates /opt/templates 30 | ENTRYPOINT ["/device-faker"] 31 | -------------------------------------------------------------------------------- /Dockerfile.gaudi: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ARG HTTP_PROXY 16 | ARG HTTPS_PROXY 17 | ARG NO_PROXY 18 | 19 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b as build 20 | ARG LOCAL_LICENSES 21 | WORKDIR /build 22 | COPY . . 
23 | 24 | # install libhlml.so 25 | RUN \ 26 | export http_proxy=${HTTP_PROXY} https_proxy=${HTTPS_PROXY} no_proxy=${NO_PROXY} && \ 27 | curl -fsSL https://vault.habana.ai/artifactory/api/gpg/key/public | gpg --dearmor | tee /etc/apt/trusted.gpg.d/habanalabs.gpg > /dev/null && \ 28 | wget -q -O /etc/apt/sources.list.d/habanalabs_synapseai.list "https://vault.habana.ai/artifactory/gaudi-installer/repos/1.16.2/debian10.10/habanalabs_synapseai.list" > /dev/null && \ 29 | apt-get update && \ 30 | apt-get download habanalabs-firmware-tools && \ 31 | ls -al && \ 32 | dpkg --force-all -i *.deb 33 | 34 | RUN make gaudi && \ 35 | mkdir -p /install_root && \ 36 | if [ -z "$LOCAL_LICENSES" ]; then \ 37 | make licenses; \ 38 | fi && \ 39 | cp -r licenses /install_root/ && \ 40 | mkdir /install_root/licenses/habanalabs && \ 41 | cp /usr/share/doc/habanalabs-firmware-tools/* /install_root/licenses/habanalabs/ && \ 42 | cp bin/kubelet-gaudi-plugin /install_root/ 43 | 44 | # Get libc and sources from Ubuntu24, libhlml needs GLIBC_2.38 45 | FROM ubuntu:24.04@sha256:80dd3c3b9c6cecb9f1667e9290b3bc61b78c2678c02cbdae5f0fea92cc6734ab as ubuntu 46 | RUN \ 47 | cat /etc/apt/sources.list.d/ubuntu.sources && \ 48 | sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources && \ 49 | apt-get update && \ 50 | apt-get install -y dpkg-dev && \ 51 | mkdir /tmp/src && \ 52 | cd /tmp/src && \ 53 | apt-get source libc6 coreutils dash 54 | 55 | FROM scratch 56 | LABEL description="Intel Gaudi resource driver for Kubernetes" 57 | 58 | COPY --from=build /install_root / 59 | COPY --from=build /usr/lib/habanalabs/libhlml.so /usr/lib/habanalabs/libhlml.so 60 | COPY --from=ubuntu /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/libc.so.6 61 | COPY --from=ubuntu /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2 62 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6 63 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libdl.so.2 
/usr/lib/x86_64-linux-gnu/libdl.so.2 64 | COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libz.so.1 /usr/lib/x86_64-linux-gnu/libz.so.1 65 | COPY --from=ubuntu /bin/cat /bin/cat 66 | COPY --from=ubuntu /bin/sh /bin/sh 67 | COPY --from=ubuntu /tmp/src/*tar.xz /src/ 68 | 69 | ENV LD_LIBRARY_PATH=/usr/lib/habanalabs:/lib/x86_64-linux-gnu:/lib64:/usr/lib/x86_64-linux-gnu 70 | ENV PATH=/bin 71 | -------------------------------------------------------------------------------- /Dockerfile.gaudi-test: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b as build 15 | WORKDIR /build 16 | COPY . . 
17 | 18 | RUN cd hack/fake_libhlml && \ 19 | make clean && make 20 | 21 | FROM golang:1.23.4@sha256:ccdca3b3bde3bfee2518a087b467f2b452fad9ba3e378d3c1578db494c8cb13b 22 | ARG UID=1001 23 | ARG GID=1001 24 | 25 | COPY --from=build /build/hack/fake_libhlml/fake_libhlml.so /usr/lib/habanalabs/libhlml.so 26 | 27 | RUN \ 28 | echo "existing user: $(id $UID)" && \ 29 | groupadd -g ${GID} ubuntu && \ 30 | useradd -m -g ${GID} -u ${UID} -s /bin/bash ubuntu && \ 31 | mkdir /github && \ 32 | chmod 777 /github 33 | 34 | RUN \ 35 | mkdir -m 755 /home/ubuntu/.cache/ && \ 36 | mkdir -m 755 /home/ubuntu/.cache/go-build && \ 37 | mkdir -m 755 /home/ubuntu/.cache/go-mod && \ 38 | chown -R ubuntu:ubuntu /home/ubuntu/.cache && \ 39 | mkdir /home/ubuntu/src && \ 40 | git config --global --add safe.directory /home/ubuntu/src 41 | 42 | ENV GOCACHE=/home/ubuntu/.cache/go-build 43 | ENV GOMODCACHE=/home/ubuntu/.cache/go-mod 44 | 45 | USER ubuntu 46 | WORKDIR /home/ubuntu 47 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build 16 | ARG LOCAL_LICENSES 17 | WORKDIR /build 18 | COPY . . 
19 | 20 | RUN make gpu && \ 21 | mkdir -p /install_root && \ 22 | if [ -z "$LOCAL_LICENSES" ]; then \ 23 | make licenses; \ 24 | fi && \ 25 | cp -r licenses /install_root/ && \ 26 | cp bin/kubelet-gpu-plugin /install_root/ 27 | 28 | FROM scratch 29 | WORKDIR / 30 | LABEL description="Intel GPU resource driver for Kubernetes" 31 | 32 | COPY --from=build /install_root / 33 | -------------------------------------------------------------------------------- /Dockerfile.qat: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM golang:1.23.4@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 as build 16 | ARG LOCAL_LICENSES 17 | WORKDIR /build 18 | COPY . . 
19 | 20 | RUN make qat && \ 21 | mkdir -p /install_root && \ 22 | if [ -z "$LOCAL_LICENSES" ]; then \ 23 | make licenses; \ 24 | fi && \ 25 | cp -r licenses /install_root/ && \ 26 | cp bin/kubelet-qat-plugin /install_root/ && \ 27 | cp bin/qat-showdevice /install_root/ 28 | 29 | 30 | FROM scratch 31 | WORKDIR / 32 | LABEL description="Intel QAT resource driver for Kubernetes" 33 | 34 | COPY --from=build /install_root / 35 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | These contents may have been developed with support from one or more Intel-operated generative artificial intelligence solutions. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel resource drivers for Kubernetes 2 | 3 | CAUTION: This is beta / non-production software, do not use on production clusters. 4 | 5 | ## This repository contains the following resource drivers: 6 | 7 | - [GPU](doc/gpu/README.md) 8 | - [Gaudi](doc/gaudi/README.md) 9 | - [QAT](doc/qat/README.md) 10 | 11 | ## Glossary 12 | 13 | - DRA https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation 14 | - CDI https://github.com/cncf-tags/container-device-interface/ 15 | - K8s https://github.com/kubernetes/kubernetes.git 16 | 17 | ## About resource drivers 18 | 19 | Intel resource drivers for Kubernetes is an alternative to 20 | [Intel device plugins](https://github.com/intel/intel-device-plugins-for-kubernetes/), 21 | facilitating workload offloading by providing accelerator access on Kubernetes cluster worker nodes. 22 | 23 | Resource drivers are not Linux kernel mode drivers (KMD), and do not help the operating system on 24 | the worker nodes detect and operate the accelerators. 
25 | 26 | The resource drivers are based on the Dynamic Resource Allocation (DRA) framework in Kubernetes. 27 | 28 | ### About Dynamic Resource Allocation 29 | 30 | Dynamic Resource Allocation (DRA) is a resource management framework in Kubernetes (1.26+), that 31 | allows management of special resources in a cluster (typically HW accelerators) by vendor-provided 32 | resource drivers (typically a controller and a node-agent / kubelet-plugin) in a common way. 33 | 34 | Resource drivers are meant to handle discovery, allocation, accounting of specific resources as well 35 | as their preparation for Pod before Pod startup, and cleanup after the Pod has completed successfully 36 | and the resource is no longer needed. More info is 37 | [in the KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation) 38 | 39 | 40 | ## Release process 41 | 42 | Every resource driver in this repository has its own releases, release branches and version tags. 43 | 44 | Typical release cadence is quarterly. During the release creation the project's documentation, 45 | deployment files etc. will be changed to point to the newly created version. 46 | 47 | Once the content is available in the main branch and validation passes, a release branch will be 48 | created (e.g. gpu-release-v0.2.0). The HEAD of the release branch will also be tagged with the corresponding 49 | tag (e.g. gpu-v0.2.0). 50 | 51 | During the release creation, the project's documentation, deployment files etc. will be changed to 52 | point to the newly created version. 53 | 54 | Patch releases (e.g. gaudi-v0.1.1) are done on an as-needed basis if there are security issues or minor fixes 55 | for a specific supported version. Fixes are always cherry-picked from the main branch to the release 56 | branches. 
57 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and 3 | providing clear guidance on the solution, impact, severity and mitigation. 4 | 5 | ## Reporting a Vulnerability 6 | Please report any security vulnerabilities in this project 7 | [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 8 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | # Common backup files 9 | *.swp 10 | *.bak 11 | *.tmp 12 | *.orig 13 | *~ 14 | # Various IDEs 15 | .project 16 | .idea/ 17 | *.tmproj 18 | .vscode/ 19 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-gaudi-resource-driver 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel Gaudi Resource Driver 4 | 5 | type: application 6 | version: 0.3.0 7 | appVersion: "v0.3.0" 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.1" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.3.0" 21 | org.opencontainers.image.title: "Intel Gaudi Resource Driver" 22 | org.opencontainers.image.description: "This chart installs the Intel Gaudi resource driver on Kubernetes." 
23 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel Gaudi Driver Helm Chart 2 | 3 | ## The chart installs Gaudi resource driver: 4 | 5 | - [Gaudi](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gaudi/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ``` 13 | helm install intel-gaudi-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver \ 14 | --create-namespace \ 15 | --namespace intel-gaudi-resource-driver 16 | ``` 17 | 18 | ## Uninstalling the chart 19 | ``` 20 | helm uninstall intel-gaudi-resource-driver --namespace intel-gaudi-resource-driver 21 | ``` 22 | (Optional) Delete the namespace: 23 | ``` 24 | kubectl delete ns intel-gaudi-resource-driver 25 | ``` 26 | 27 | ## Configuration 28 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments: 29 | 30 | ``` 31 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver 32 | ``` 33 | 34 | You may also run `helm show values` on this chart's dependencies for additional options. 
35 | 36 | | Key | Type | Default | 37 | |-----|------|---------| 38 | | image.repository | string | `intel` | 39 | | image.name | string | `"intel-gaudi-resource-driver"` | 40 | | image.pullPolicy | string | `"IfNotPresent"` | 41 | | image.tag | string | `"v0.3.0"` | 42 | 43 | > [!Note] 44 | > If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases. 45 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-gaudi-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-gaudi-resource-driver.baseName" -}} 8 | intel-gaudi-resource-driver 9 | {{- end }} 10 | 11 | {{/* Specific helpers */}} 12 | {{- define "intel-gaudi-resource-driver.name" -}} 13 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 14 | {{- end }} 15 | 16 | {{/* Create a default fully qualified app name */}} 17 | {{- define "intel-gaudi-resource-driver.fullname" -}} 18 | {{- if .Values.fullnameOverride -}} 19 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 20 | {{- else -}} 21 | {{- printf "%s-%s" (include "intel-gaudi-resource-driver.baseName" .) 
.Release.Name | trunc 63 | trimSuffix "-" -}} 22 | {{- end -}} 23 | {{- end }} 24 | 25 | {{- define "intel-gaudi-resource-driver.namespace" -}} 26 | {{- default .Release.Namespace .Values.namespaceOverride }} 27 | {{- end }} 28 | 29 | {{/* Labels for templates */}} 30 | {{- define "intel-gaudi-resource-driver.labels" -}} 31 | helm.sh/chart: {{ include "intel-gaudi-resource-driver.chart" . }} 32 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 33 | app.kubernetes.io/managed-by: {{ .Release.Service }} 34 | {{- end }} 35 | 36 | {{- define "intel-gaudi-resource-driver.clusterRoleName" -}} 37 | {{- printf "%s-role" (include "intel-gaudi-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 38 | {{- end }} 39 | 40 | {{- define "intel-gaudi-resource-driver.clusterRoleBindingName" -}} 41 | {{- printf "%s-rolebinding" (include "intel-gaudi-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 42 | {{- end }} 43 | 44 | {{- define "intel-gaudi-resource-driver.serviceAccountName" -}} 45 | {{- if .Values.serviceAccount.create -}} 46 | {{- default "intel-gaudi-sa" .Values.serviceAccount.name -}} 47 | {{- end -}} 48 | {{- end }} 49 | 50 | {{/* Define full image name */}} 51 | {{- define "intel-gaudi-resource-driver.fullimage" -}} 52 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 53 | {{- end }} 54 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . }} 5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . 
}} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleBindingName" . }} 5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 9 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . 
}} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gaudi.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == "gaudi.intel.com" 10 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/nfd.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.nfd.enabled }} 2 | apiVersion: nfd.k8s-sigs.io/v1alpha1 3 | kind: NodeFeatureRule 4 | metadata: 5 | name: intel-gaudi-device-rule 6 | spec: 7 | rules: 8 | - name: "intel.gaudi" 9 | labels: 10 | "intel.feature.node.kubernetes.io/gaudi": "true" 11 | matchFeatures: 12 | - feature: pci.device 13 | matchExpressions: 14 | vendor: {op: In, value: ["1da3"]} 15 | device: {op: In, value: ["1020", "1030"]} 16 | {{- end }} 17 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/resource-driver-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: intel-gaudi-resource-driver 5 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} 6 | labels: 7 | {{- include "intel-gaudi-resource-driver.labels" . 
| nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-gaudi-resource-driver-kubelet-plugin 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-gaudi-resource-driver-kubelet-plugin 16 | spec: 17 | serviceAccount: intel-gaudi-resource-driver-service-account 18 | serviceAccountName: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 19 | containers: 20 | - name: kubelet-plugin 21 | image: {{ include "intel-gaudi-resource-driver.fullimage" . }} 22 | imagePullPolicy: {{ .Values.image.pullPolicy }} 23 | command: ["/kubelet-gaudi-plugin"] 24 | env: 25 | - name: NODE_NAME 26 | valueFrom: 27 | fieldRef: 28 | fieldPath: spec.nodeName 29 | - name: POD_NAMESPACE 30 | valueFrom: 31 | fieldRef: 32 | fieldPath: metadata.namespace 33 | - name: SYSFS_ROOT 34 | value: "/sysfs" 35 | volumeMounts: 36 | - name: plugins-registry 37 | mountPath: /var/lib/kubelet/plugins_registry 38 | - name: plugins 39 | mountPath: /var/lib/kubelet/plugins 40 | - name: cdi 41 | mountPath: /etc/cdi 42 | - name: varruncdi 43 | mountPath: /var/run/cdi 44 | # when using fake sysfs - mount at the same place as on host 45 | - name: sysfs 46 | mountPath: "/sysfs" 47 | securityContext: 48 | privileged: false 49 | allowPrivilegeEscalation: false 50 | capabilities: 51 | drop: ["ALL"] 52 | readOnlyRootFilesystem: true 53 | runAsUser: 0 54 | seccompProfile: 55 | type: RuntimeDefault 56 | volumes: 57 | - name: plugins-registry 58 | hostPath: 59 | path: /var/lib/kubelet/plugins_registry 60 | - name: plugins 61 | hostPath: 62 | path: /var/lib/kubelet/plugins 63 | - name: cdi 64 | hostPath: 65 | path: /etc/cdi 66 | - name: varruncdi 67 | hostPath: 68 | path: /var/run/cdi 69 | - name: sysfs 70 | hostPath: 71 | path: /sys 72 | {{- with .Values.kubeletPlugin.tolerations }} 73 | tolerations: 74 | {{- toYaml . 
| nindent 8 }} 75 | {{- end }} 76 | {{- if .Values.nfd.enabled }} 77 | nodeSelector: 78 | intel.feature.node.kubernetes.io/gaudi: "true" 79 | {{- else }} 80 | {{- with .Values.kubeletPlugin.nodeSelector }} 81 | nodeSelector: 82 | {{- toYaml . | nindent 8 }} 83 | {{- end }} 84 | {{- end }} 85 | {{- with .Values.kubeletPlugin.affinity }} 86 | affinity: 87 | {{- toYaml . | nindent 8 }} 88 | {{- end }} 89 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 5 | namespace: {{ include "intel-gaudi-resource-driver.namespace" . }} 6 | labels: 7 | {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . 
| nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-gaudi 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: ["resource.k8s.io"] 10 | apiVersions: ["v1beta1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:intel-gaudi-resource-driver:intel-gaudi-resource-driver-service-account" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? 
oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-gaudi-resource-driver. 2 | nameOverride: "" 3 | namespaceOverride: "intel-gaudi-resource-driver" 4 | fullnameOverride: "" 5 | selectorLabelsOverride: {} 6 | 7 | imagePullSecrets: [] 8 | image: 9 | repository: intel 10 | name: intel-gaudi-resource-driver 11 | pullPolicy: IfNotPresent 12 | tag: "v0.3.0" 13 | 14 | serviceAccount: 15 | create: true 16 | annotations: {} 17 | name: intel-gaudi-resource-driver-service-account 18 | automount: true 19 | 20 | kubeletPlugin: 21 | podAnnotations: {} 22 | nodeSelector: {} 23 | # label used when nfd.enabled is true 24 | #intel.feature.node.kubernetes.io/gaudi: "true" 25 | tolerations: 26 | - key: node-role.kubernetes.io/master 27 | operator: Exists 28 | effect: NoSchedule 29 | - key: node-role.kubernetes.io/control-plane 30 | operator: Exists 31 | effect: NoSchedule 32 | # Refer to the official documentation for Node Feature Discovery (NFD) 33 | # regarding node tainting: 34 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 35 | - key: "intel.feature.node.kubernetes.io/gaudi" 36 | operator: "Exists" 37 | effect: "NoSchedule" 38 | affinity: {} 39 | 40 | nfd: 41 | enabled: false # change to true to install NFD to the 
cluster 42 | nameOverride: intel-gaudi-nfd 43 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16): 44 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters 45 | enableNodeFeatureApi: true 46 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | # Common backup files 9 | *.swp 10 | *.bak 11 | *.tmp 12 | *.orig 13 | *~ 14 | # Various IDEs 15 | .project 16 | .idea/ 17 | *.tmproj 18 | .vscode/ 19 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-gpu-resource-driver 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver 4 | 5 | type: application 6 | version: 0.7.0 7 | appVersion: "v0.7.0" 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.1" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.7.0" 21 | org.opencontainers.image.title: "Intel GPU Resource Driver" 22 | 
org.opencontainers.image.description: "This chart installs the Intel GPU resource driver on Kubernetes." 23 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel GPU Driver Helm Chart 2 | 3 | ## The chart installs GPU resource driver: 4 | 5 | - [GPU](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gpu/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ``` 13 | helm install \ 14 | --namespace "intel-gpu-resource-driver" \ 15 | --create-namespace \ 16 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver 17 | ``` 18 | 19 | > [!NOTE] 20 | > For Kubernetes clusters using [Pod Security Standards](https://kubernetes.io/docs/concepts/security/pod-security-standards/), 21 | > pre-create the namespace with the label that allows the use of HostPath volumes. 22 | 23 | ``` 24 | kubectl create namespace intel-gpu-resource-driver 25 | kubectl label --overwrite namespace intel-gpu-resource-driver pod-security.kubernetes.io/enforce=privileged 26 | helm install \ 27 | --namespace "intel-gpu-resource-driver" \ 28 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver 29 | ``` 30 | 31 | ## Uninstalling the chart 32 | ``` 33 | helm uninstall intel-gpu-resource-driver --namespace intel-gpu-resource-driver 34 | ``` 35 | (Optional) Delete the namespace: 36 | ``` 37 | kubectl delete ns intel-gpu-resource-driver 38 | ``` 39 | 40 | ## Configuration 41 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). 
To see all configurable options with detailed comments: 42 | 43 | ```console 44 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver 45 | ``` 46 | 47 | You may also run `helm show values` on this chart's dependencies for additional options. 48 | 49 | | Key | Type | Default | 50 | |-----|------|---------| 51 | | image.repository | string | `intel` | 52 | | image.name | string | `"intel-gpu-resource-driver"` | 53 | | image.pullPolicy | string | `"IfNotPresent"` | 54 | | image.tag | string | `"v0.7.0"` | 55 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-gpu-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-gpu-resource-driver.baseName" -}} 8 | intel-gpu-resource-driver 9 | {{- end }} 10 | 11 | {{- define "intel-gpu-resource-driver.name" -}} 12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 13 | {{- end }} 14 | 15 | {{- define "intel-gpu-resource-driver.fullname" -}} 16 | {{- if .Values.fullnameOverride -}} 17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 18 | {{- else -}} 19 | {{- printf "%s-%s" (include "intel-gpu-resource-driver.baseName" .) 
.Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- end -}} 21 | {{- end }} 22 | 23 | {{/* Labels for templates */}} 24 | {{- define "intel-gpu-resource-driver.labels" -}} 25 | helm.sh/chart: {{ include "intel-gpu-resource-driver.chart" . }} 26 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 27 | app.kubernetes.io/managed-by: {{ .Release.Service }} 28 | {{- end }} 29 | 30 | {{- define "intel-gpu-resource-driver.clusterRoleName" -}} 31 | {{- printf "%s-role" (include "intel-gpu-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 32 | {{- end }} 33 | 34 | {{- define "intel-gpu-resource-driver.clusterRoleBindingName" -}} 35 | {{- printf "%s-rolebinding" (include "intel-gpu-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 36 | {{- end }} 37 | 38 | {{- define "intel-gpu-resource-driver.serviceAccountName" -}} 39 | {{- if .Values.serviceAccount.create -}} 40 | {{- default "intel-gpu-sa" .Values.serviceAccount.name -}} 41 | {{- end -}} 42 | {{- end }} 43 | 44 | {{/* Define full image name */}} 45 | {{- define "intel-gpu-resource-driver.fullimage" -}} 46 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 47 | {{- end }} 48 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.clusterRoleBindingName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 9 | namespace: {{ .Release.Namespace }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . 
}} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gpu.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == "gpu.intel.com" 10 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/node-feature-rules.yaml: -------------------------------------------------------------------------------- 1 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 2 | apiVersion: nfd.k8s-sigs.io/v1alpha1 3 | kind: NodeFeatureRule 4 | metadata: 5 | name: intel-gpu-device-rule 6 | spec: 7 | rules: 8 | - name: "intel.gpu" 9 | labels: 10 | "intel.feature.node.kubernetes.io/gpu": "true" 11 | matchFeatures: 12 | - feature: pci.device 13 | matchExpressions: 14 | vendor: {op: In, value: ["8086"]} 15 | class: {op: In, value: ["0300", "0380"]} 16 | matchAny: 17 | - matchFeatures: 18 | - feature: kernel.loadedmodule 19 | matchExpressions: 20 | i915: {op: Exists} 21 | - matchFeatures: 22 | - feature: kernel.enabledmodule 23 | matchExpressions: 24 | i915: {op: Exists} 25 | --- 26 | apiVersion: nfd.k8s-sigs.io/v1alpha1 27 | kind: NodeFeatureRule 28 | metadata: 29 | name: intel-gpu-platform-labeling 30 | spec: 31 | rules: 32 | # A_Series (Alchemist) 33 | - labels: 34 | gpu.intel.com/family: "A_Series" 35 | matchFeatures: 36 | - feature: pci.device 37 | matchExpressions: 38 | class: {op: In, value: ["0300"]} 39 | vendor: {op: In, value: ["8086"]} 40 | device: 41 | op: In 42 | value: 43 | - "56a6" 44 | - "56a5" 45 | - "56a1" 46 | - "56a0" 47 | - "5694" 48 | - "5693" 49 | - "5692" 50 | - "5691" 51 | - "5690" 52 | - "56b3" 53 | - "56b2" 54 | - "56a4" 55 | - "56a3" 56 | - "5697" 57 | - 
"5696" 58 | - "5695" 59 | - "56b1" 60 | - "56b0" 61 | name: intel.gpu.a.series 62 | # Max_Series 63 | - labels: 64 | gpu.intel.com/family: "Max_Series" 65 | matchFeatures: 66 | - feature: pci.device 67 | matchExpressions: 68 | class: {op: In, value: ["0380"]} 69 | vendor: {op: In, value: ["8086"]} 70 | device: 71 | op: In 72 | value: 73 | - "0bda" 74 | - "0bd5" 75 | - "0bd9" 76 | - "0bdb" 77 | - "0bd7" 78 | - "0bd6" 79 | - "0bd0" 80 | name: intel.gpu.max.series 81 | # Flex_Series 82 | - labels: 83 | gpu.intel.com/family: "Flex_Series" 84 | matchFeatures: 85 | - feature: pci.device 86 | matchExpressions: 87 | class: {op: In, value: ["0300", "0380"]} 88 | vendor: {op: In, value: ["8086"]} 89 | device: 90 | op: In 91 | value: 92 | - "0f00" 93 | - "0f01" 94 | - "0f02" 95 | name: intel.gpu.flex.series 96 | {{- end }} 97 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-gpu-resource-driver 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-gpu-resource-driver 16 | spec: 17 | serviceAccountName: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 18 | containers: 19 | - name: kubelet-plugin 20 | image: {{ include "intel-gpu-resource-driver.fullimage" . 
}} 21 | imagePullPolicy: {{ .Values.image.pullPolicy }} 22 | command: ["/kubelet-gpu-plugin"] 23 | env: 24 | - name: NODE_NAME 25 | valueFrom: 26 | fieldRef: 27 | fieldPath: spec.nodeName 28 | - name: POD_NAMESPACE 29 | valueFrom: 30 | fieldRef: 31 | fieldPath: metadata.namespace 32 | - name: SYSFS_ROOT 33 | value: "/sysfs" 34 | volumeMounts: 35 | - name: plugins-registry 36 | mountPath: /var/lib/kubelet/plugins_registry 37 | - name: plugins 38 | mountPath: /var/lib/kubelet/plugins 39 | - name: cdi 40 | mountPath: /etc/cdi 41 | - name: varruncdi 42 | mountPath: /var/run/cdi 43 | # when using fake sysfs - mount at the same place as on host 44 | - name: sysfs 45 | mountPath: "/sysfs" 46 | securityContext: 47 | privileged: false 48 | allowPrivilegeEscalation: false 49 | capabilities: 50 | drop: ["ALL"] 51 | readOnlyRootFilesystem: true 52 | runAsUser: 0 53 | seccompProfile: 54 | type: RuntimeDefault 55 | volumes: 56 | - name: plugins-registry 57 | hostPath: 58 | path: /var/lib/kubelet/plugins_registry 59 | - name: plugins 60 | hostPath: 61 | path: /var/lib/kubelet/plugins 62 | - name: cdi 63 | hostPath: 64 | path: {{ .Values.cdi.staticPath }} 65 | - name: varruncdi 66 | hostPath: 67 | path: {{ .Values.cdi.dynamicPath}} 68 | - name: sysfs 69 | hostPath: 70 | path: /sys 71 | {{- with .Values.kubeletPlugin.tolerations }} 72 | tolerations: 73 | {{- toYaml . | nindent 8 }} 74 | {{- end }} 75 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 76 | nodeSelector: 77 | intel.feature.node.kubernetes.io/gpu: "true" 78 | {{- else }} 79 | {{- with .Values.kubeletPlugin.nodeSelector }} 80 | nodeSelector: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- end }} 84 | {{- with .Values.kubeletPlugin.affinity }} 85 | affinity: 86 | {{- toYaml . 
| nindent 8 }} 87 | {{- end }} 88 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gpu 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-gpu 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gpu 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: ["resource.k8s.io"] 10 | apiVersions: ["v1beta1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:{{ .Release.Namespace }}:{{ 
include "intel-gpu-resource-driver.serviceAccountName" . }}" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-gpu-resource-driver. 
2 | nameOverride: "" 3 | fullnameOverride: "" 4 | selectorLabelsOverride: {} 5 | 6 | imagePullSecrets: [] 7 | image: 8 | repository: intel 9 | name: intel-gpu-resource-driver 10 | pullPolicy: IfNotPresent 11 | tag: "v0.7.0" 12 | 13 | serviceAccount: 14 | create: true 15 | annotations: {} 16 | name: "" 17 | automount: true 18 | 19 | kubeletPlugin: 20 | podAnnotations: {} 21 | nodeSelector: {} # ignored when .Values.nodeFeatureRules.enabled or .Values.nfd.enabled 22 | tolerations: 23 | - key: node-role.kubernetes.io/master 24 | operator: Exists 25 | effect: NoSchedule 26 | - key: node-role.kubernetes.io/control-plane 27 | operator: Exists 28 | effect: NoSchedule 29 | # Refer to the official documentation for Node Feature Discovery (NFD) 30 | # regarding node tainting: 31 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 32 | - key: "node.kubernetes.io/gpu" 33 | operator: "Exists" 34 | effect: "NoSchedule" 35 | affinity: {} 36 | 37 | cdi: 38 | staticPath: /etc/cdi 39 | dynamicPath: /var/run/cdi 40 | 41 | nodeFeatureRules: 42 | enabled: false 43 | 44 | nfd: 45 | enabled: false # change to true to install NFD to the cluster 46 | nameOverride: intel-gpu-nfd 47 | enableNodeFeatureApi: true 48 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-qat-resource-driver 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel QAT Resource Driver 4 | 5 | type: application 6 | version: 0.2.0 7 | appVersion: "v0.2.0" 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.1" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: 
"https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.2.0" 21 | org.opencontainers.image.title: "Intel QAT Resource Driver" 22 | org.opencontainers.image.description: "This chart installs the Intel QAT resource driver on Kubernetes." 23 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel QAT Driver Helm Chart 2 | 3 | ## The chart installs QAT resource driver: 4 | 5 | - [QAT](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/qat/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ``` 13 | helm install intel-qat-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver \ 14 | --create-namespace \ 15 | --namespace intel-qat-resource-driver 16 | ``` 17 | 18 | ## Uninstalling the chart 19 | ``` 20 | helm uninstall intel-qat-resource-driver --namespace intel-qat-resource-driver 21 | ``` 22 | (Optional) Delete the namespace: 23 | ``` 24 | kubectl delete ns intel-qat-resource-driver 25 | ``` 26 | 27 | ## Configuration 28 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments: 29 | 30 | ```console 31 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver 32 | ``` 33 | 34 | You may also run `helm show values` on this chart's dependencies for additional options. 
35 | 36 | | Key | Type | Default | 37 | |-----|------|---------| 38 | | image.repository | string | `intel` | 39 | | image.name | string | `"intel-qat-resource-driver"` | 40 | | image.pullPolicy | string | `"IfNotPresent"` | 41 | | image.tag | string | `"v0.2.0"` | 42 | 43 | If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases. 44 | 45 | 46 | ## Read-only file system error for QAT 47 | 48 | When the following error appears in the logs of the QAT Kubelet plugin: 49 | ``` 50 | kubectl logs -n intel-qat-resource-driver intel-qat-resource-driver-kubelet-plugin-ttcs6 51 | DRA kubelet plugin 52 | In-cluster config 53 | Setting up CDI 54 | failed to create kubelet plugin driver: cannot enable PF device '0000:6b:00.0': open /sysfs/bus/pci/devices/0000:6b:00.0/sriov_numvfs: read-only file system 55 | ``` 56 | 57 | Try resetting QAT by reloading its kernel driver: 58 | ``` 59 | rmmod qat_4xxx 60 | modprobe qat_4xxx 61 | ``` 62 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. 
-------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-qat-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-qat-resource-driver.baseName" -}} 8 | intel-qat-resource-driver 9 | {{- end }} 10 | 11 | {{- define "intel-qat-resource-driver.name" -}} 12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 13 | {{- end }} 14 | 15 | {{- define "intel-qat-resource-driver.fullname" -}} 16 | {{- if .Values.fullnameOverride -}} 17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 18 | {{- else -}} 19 | {{- printf "%s-%s" (include "intel-qat-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- end -}} 21 | {{- end }} 22 | 23 | {{- define "intel-qat-resource-driver.namespace" -}} 24 | {{- default .Release.Namespace .Values.namespaceOverride }} 25 | {{- end }} 26 | 27 | {{/* Labels for templates */}} 28 | {{- define "intel-qat-resource-driver.labels" -}} 29 | helm.sh/chart: {{ include "intel-qat-resource-driver.chart" . }} 30 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 31 | app.kubernetes.io/managed-by: {{ .Release.Service }} 32 | {{- end }} 33 | 34 | {{- define "intel-qat-resource-driver.clusterRoleName" -}} 35 | {{- printf "%s-role" (include "intel-qat-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 36 | {{- end }} 37 | 38 | {{- define "intel-qat-resource-driver.clusterRoleBindingName" -}} 39 | {{- printf "%s-rolebinding" (include "intel-qat-resource-driver.baseName" .) 
| trunc 63 | trimSuffix "-" }} 40 | {{- end }} 41 | 42 | {{- define "intel-qat-resource-driver.serviceAccountName" -}} 43 | {{- if .Values.serviceAccount.create -}} 44 | {{- default "intel-qat-sa" .Values.serviceAccount.name -}} 45 | {{- end -}} 46 | {{- end }} 47 | 48 | {{/* Define full image name */}} 49 | {{- define "intel-qat-resource-driver.fullimage" -}} 50 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 51 | {{- end }} 52 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . }} 5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.clusterRoleBindingName" . }} 5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 9 | namespace: {{ include "intel-qat-resource-driver.namespace" . }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . 
}} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: qat.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == "qat.intel.com" 10 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/nfd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-qat-device-rule 5 | spec: 6 | rules: 7 | - name: "intel.qat" 8 | labels: 9 | feature.node.kubernetes.io/qat: "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["8086"]} 14 | device: {op: In, value: ["4940", "4941", "4944", "4946"]} 15 | class: {op: In, value: ["0b40"]} 16 | - feature: kernel.loadedmodule 17 | matchExpressions: 18 | intel_qat: {op: Exists} 19 | matchAny: 20 | - matchFeatures: 21 | - feature: kernel.loadedmodule 22 | matchExpressions: 23 | vfio_pci: {op: Exists} 24 | - matchFeatures: 25 | - feature: kernel.enabledmodule 26 | matchExpressions: 27 | vfio-pci: {op: Exists} 28 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/resource-driver-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: intel-qat-resource-driver 5 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-qat-resource-driver-kubelet-plugin 5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }} 6 | labels: 7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-qat-resource-driver 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-qat-resource-driver 16 | spec: 17 | serviceAccount: intel-qat-resource-driver-service-account 18 | serviceAccountName: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 19 | containers: 20 | - name: kubelet-plugin 21 | image: {{ include "intel-qat-resource-driver.fullimage" . }} 22 | imagePullPolicy: {{ .Values.image.pullPolicy }} 23 | command: ["/kubelet-qat-plugin"] 24 | env: 25 | - name: NODE_NAME 26 | valueFrom: 27 | fieldRef: 28 | fieldPath: spec.nodeName 29 | - name: POD_NAMESPACE 30 | valueFrom: 31 | fieldRef: 32 | fieldPath: metadata.namespace 33 | - name: SYSFS_ROOT 34 | value: "/sysfs" 35 | volumeMounts: 36 | - name: plugins-registry 37 | mountPath: /var/lib/kubelet/plugins_registry 38 | - name: plugins 39 | mountPath: /var/lib/kubelet/plugins 40 | - name: cdi 41 | mountPath: /etc/cdi 42 | - name: varruncdi 43 | mountPath: /var/run/cdi 44 | - name: sysfs 45 | mountPath: /sysfs 46 | - name: qatconfiguration 47 | mountPath: /defaults 48 | securityContext: 49 | privileged: true 50 | readOnlyRootFilesystem: true 51 | seccompProfile: 52 | type: RuntimeDefault 53 | volumes: 54 | - name: plugins-registry 55 | hostPath: 56 | path: /var/lib/kubelet/plugins_registry 57 | - name: plugins 58 | hostPath: 59 | path: /var/lib/kubelet/plugins 60 | - name: cdi 61 | hostPath: 62 | path: /etc/cdi 63 | - name: varruncdi 64 | hostPath: 65 | path: /var/run/cdi 66 | - name: sysfs 67 | hostPath: 68 | path: /sys 69 | - name: qatconfiguration 70 | configMap: 71 | name: intel-qat-resource-driver-configuration 72 | optional: true 73 | {{- with .Values.kubeletPlugin.tolerations }} 74 | 
tolerations: 75 | {{- toYaml . | nindent 8 }} 76 | {{- end }} 77 | {{- with .Values.kubeletPlugin.nodeSelector }} 78 | nodeSelector: 79 | {{- toYaml . | nindent 8 }} 80 | {{- end }} 81 | {{- with .Values.kubeletPlugin.affinity }} 82 | affinity: 83 | {{- toYaml . | nindent 8 }} 84 | {{- end }} 85 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 5 | namespace: {{ include "intel-qat-resource-driver.namespace" . }} 6 | labels: 7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-qat 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-qat 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-qat 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: 
["resource.k8s.io"] 10 | apiVersions: ["v1beta1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:intel-qat-resource-driver:intel-qat-resource-driver-service-account" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-qat-resource-driver. 
2 | nameOverride: "" 3 | namespaceOverride: "intel-qat-resource-driver" 4 | fullnameOverride: "" 5 | selectorLabelsOverride: {} 6 | 7 | imagePullSecrets: [] 8 | image: 9 | repository: intel 10 | name: intel-qat-resource-driver 11 | pullPolicy: IfNotPresent 12 | tag: "v0.2.0" 13 | 14 | serviceAccount: 15 | create: true 16 | annotations: {} 17 | name: "intel-qat-resource-driver-service-account" 18 | automount: true 19 | 20 | kubeletPlugin: 21 | podAnnotations: {} 22 | nodeSelector: 23 | feature.node.kubernetes.io/qat: "true" 24 | tolerations: 25 | - key: node-role.kubernetes.io/master 26 | operator: Exists 27 | effect: NoSchedule 28 | - key: node-role.kubernetes.io/control-plane 29 | operator: Exists 30 | effect: NoSchedule 31 | # Refer to the official documentation for Node Feature Discovery (NFD) 32 | # regarding node tainting: 33 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 34 | - key: "node.kubernetes.io/qat" 35 | operator: "Exists" 36 | effect: "NoSchedule" 37 | affinity: {} 38 | 39 | nfd: 40 | enabled: false # change to true to install NFD to the cluster 41 | nameOverride: intel-qat-nfd 42 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16): 43 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters 44 | enableNodeFeatureApi: true 45 | -------------------------------------------------------------------------------- /cmd/kubelet-gaudi-plugin/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | 23 | "github.com/urfave/cli/v2" 24 | 25 | gaudi "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device" 26 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers" 27 | ) 28 | 29 | type GaudiFlags struct { 30 | Healthcare bool 31 | HealthcareInterval int 32 | } 33 | 34 | const ( 35 | HealthCareFlagDefault = false 36 | HealthcareIntervalFlagMin = 1 37 | HealthcareIntervalFlagMax = 3600 38 | HealthcareIntervalFlagDefault = 5 39 | ) 40 | 41 | func main() { 42 | gaudiFlags := GaudiFlags{ 43 | Healthcare: HealthCareFlagDefault, 44 | HealthcareInterval: HealthcareIntervalFlagDefault, 45 | } 46 | 47 | cliFlags := []cli.Flag{ 48 | &cli.BoolFlag{ 49 | Name: "health-monitoring", 50 | Aliases: []string{"m"}, 51 | Usage: "Actively monitor device health and update ResourceSlice. 
Requires privileges.", 52 | Value: HealthCareFlagDefault, 53 | Destination: &gaudiFlags.Healthcare, 54 | EnvVars: []string{"HEALTH_MONITORING"}, 55 | }, 56 | &cli.IntFlag{ 57 | Name: "health-interval", 58 | Aliases: []string{"i"}, 59 | Usage: fmt.Sprintf("Number of seconds between health-monitoring checks [%v ~ %v]", HealthcareIntervalFlagMin, HealthcareIntervalFlagMax), 60 | Value: HealthcareIntervalFlagDefault, 61 | Destination: &gaudiFlags.HealthcareInterval, 62 | EnvVars: []string{"HEALTH_INTERVAL"}, 63 | }, 64 | } 65 | 66 | if err := helpers.NewApp(gaudi.DriverName, newDriver, cliFlags, &gaudiFlags).Run(os.Args); err != nil { 67 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 68 | os.Exit(1) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /cmd/kubelet-gaudi-plugin/node_state_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "reflect" 21 | "testing" 22 | 23 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device" 24 | ) 25 | 26 | func TestDeviceInfoDeepCopy(t *testing.T) { 27 | di := device.DeviceInfo{ 28 | UID: "f", 29 | Model: "ff", 30 | } 31 | 32 | dc := di.DeepCopy() 33 | 34 | if !reflect.DeepEqual(&di, dc) { 35 | t.Fatalf("device infos %v and %v do not match", di, dc) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | 23 | "github.com/urfave/cli/v2" 24 | 25 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gpu/device" 26 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers" 27 | ) 28 | 29 | type GPUFlags struct { 30 | Partitioning bool 31 | } 32 | 33 | const ( 34 | PartitioningDefault = false 35 | ) 36 | 37 | func main() { 38 | gpuFlags := GPUFlags{} 39 | cliFlags := []cli.Flag{ 40 | &cli.BoolFlag{ 41 | Name: "partitioning-management", 42 | Aliases: []string{"p"}, 43 | Usage: "Manage partitioning physical devices into virtual. 
[Not Supported]", 44 | Value: PartitioningDefault, 45 | Destination: &gpuFlags.Partitioning, 46 | EnvVars: []string{"PARTITIONING"}, 47 | }, 48 | } 49 | 50 | if err := helpers.NewApp(device.DriverName, newDriver, cliFlags, &gpuFlags).Run(os.Args); err != nil { 51 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 52 | os.Exit(1) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/test-claims/empty.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/test-claims/invalid.json: -------------------------------------------------------------------------------- 1 | {"foo":"bar",} 2 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/test-claims/multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "uid1": [ 3 | { 4 | "request_names": [ 5 | "request1" 6 | ], 7 | "pool_name": "node1", 8 | "device_name": "0000-af-00-1-0xabcd", 9 | "cdi_device_ids": [ 10 | "0000-af-00-1-0xabcd" 11 | ] 12 | } 13 | ], 14 | "uid2": [ 15 | { 16 | "request_names": [ 17 | "request1" 18 | ], 19 | "pool_name": "node1", 20 | "device_name": "0000-af-00-2-0xabcd", 21 | "cdi_device_ids": [ 22 | "0000-af-00-2-0xabcd" 23 | ] 24 | } 25 | ], 26 | "uid3": [ 27 | { 28 | "request_names": [ 29 | "request1" 30 | ], 31 | "pool_name": "node1", 32 | "device_name": "0000-af-00-3-0xabcd", 33 | "cdi_device_ids": [ 34 | "0000-af-00-3-0xabcd" 35 | ] 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/clientsets.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 
| "os" 10 | 11 | "k8s.io/client-go/kubernetes" 12 | "k8s.io/client-go/rest" 13 | "k8s.io/client-go/tools/clientcmd" 14 | "k8s.io/klog/v2" 15 | ) 16 | 17 | type ClientSet struct { 18 | csconfig *rest.Config 19 | } 20 | 21 | type KubeClient kubernetes.Interface 22 | 23 | // Create a new client config. Use KUBECONFIG environment variable if set, 24 | // otherwise resort to in-cluster config. 25 | func (c *ClientSet) newClientSetConfig() error { 26 | var err error 27 | 28 | if c.csconfig != nil { 29 | return nil 30 | } 31 | 32 | kubeconfenv := os.Getenv("KUBECONFIG") 33 | if kubeconfenv == "" { 34 | klog.V(5).Info("In-cluster config") 35 | 36 | c.csconfig, err = rest.InClusterConfig() 37 | if err != nil { 38 | return fmt.Errorf("creating in-cluster client configuration: %v", err) 39 | } 40 | } else { 41 | klog.V(5).Infof("Using env variable KUBECONFIG=%s", kubeconfenv) 42 | 43 | c.csconfig, err = clientcmd.BuildConfigFromFlags("", kubeconfenv) 44 | if err != nil { 45 | return fmt.Errorf("creating out-of-cluster client configuration: %v", err) 46 | } 47 | 48 | } 49 | 50 | return nil 51 | } 52 | 53 | func (c *ClientSet) NewKubeClient() (KubeClient, error) { 54 | if err := c.newClientSetConfig(); err != nil { 55 | return nil, err 56 | } 57 | 58 | kubeclient, err := kubernetes.NewForConfig(c.csconfig) 59 | if err != nil { 60 | return nil, fmt.Errorf("creating kubernetes client: %v", err) 61 | } 62 | 63 | return kubeclient, nil 64 | } 65 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/config.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "encoding/json" 9 | "fmt" 10 | "os" 11 | 12 | "k8s.io/klog/v2" 13 | 14 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 15 | ) 16 | 17 | const defaultConfigFile = 
"/defaults/qatdefaults.config" 18 | 19 | func readConfigFile(hostname string) (map[string]string, error) { 20 | configBytes, err := os.ReadFile(defaultConfigFile) 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | var configFile map[string]map[string]string 26 | if err := json.Unmarshal(configBytes, &configFile); err != nil { 27 | return nil, err 28 | } 29 | 30 | hostConfig, exists := configFile[hostname] 31 | if !exists { 32 | return nil, fmt.Errorf("no config for host '%s' found", hostname) 33 | } 34 | 35 | return hostConfig, nil 36 | } 37 | 38 | func getDefaultConfiguration(hostname string, q device.QATDevices) error { 39 | serviceconfig, err := readConfigFile(hostname) 40 | if err != nil { 41 | klog.Infof("Could not read default config file - leaving unconfigured: %v", err) 42 | return nil 43 | } 44 | 45 | klog.V(5).Infof("Default config for host '%s':", hostname) 46 | for _, pf := range q { 47 | if servicestr, exists := serviceconfig[pf.Device]; exists { 48 | var services device.Services 49 | var err error 50 | 51 | if services, err = device.StringToServices(servicestr); err != nil { 52 | klog.Warningf("Error parsing default config services for PF device '%s': %v", pf.Device, err) 53 | continue 54 | } 55 | 56 | if err := pf.SetServices([]device.Services{services}); err != nil { 57 | klog.Warningf("Error configuring services '%s' for PF device '%s': %v", services.String(), pf.Device, err) 58 | continue 59 | } 60 | 61 | klog.V(5).Infof("PF device '%s' configured with services %s'", pf.Device, services.String()) 62 | } 63 | } 64 | 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/deviceresources.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | resourceapi "k8s.io/api/resource/v1beta1" 9 | "k8s.io/klog/v2" 
10 | "k8s.io/utils/ptr" 11 | 12 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 13 | ) 14 | 15 | func deviceResources(qatvfdevices device.VFDevices) *[]resourceapi.Device { 16 | resourcedevices := []resourceapi.Device{} 17 | 18 | for _, qatvfdevice := range qatvfdevices { 19 | device := resourceapi.Device{ 20 | Name: qatvfdevice.UID(), 21 | Basic: &resourceapi.BasicDevice{ 22 | Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{ 23 | "services": { 24 | StringValue: ptr.To(qatvfdevice.Services()), 25 | }, 26 | }, 27 | }, 28 | } 29 | resourcedevices = append(resourcedevices, device) 30 | 31 | klog.V(5).Infof("Adding Device resource: name '%s', service '%s'", device.Name, *device.Basic.Attributes["services"].StringValue) 32 | } 33 | 34 | return &resourcedevices 35 | } 36 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/main.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "context" 9 | "fmt" 10 | "os" 11 | "os/signal" 12 | "syscall" 13 | 14 | "github.com/spf13/cobra" 15 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 16 | cliflag "k8s.io/component-base/cli/flag" 17 | "k8s.io/component-base/featuregate" 18 | "k8s.io/component-base/logs" 19 | logsapi "k8s.io/component-base/logs/api/v1" 20 | "k8s.io/component-base/term" 21 | "k8s.io/dynamic-resource-allocation/kubeletplugin" 22 | "k8s.io/klog/v2" 23 | 24 | driverVersion "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/version" 25 | ) 26 | 27 | func cmdRun(cmd *cobra.Command, args []string) error { 28 | var ( 29 | d *driver 30 | err error 31 | ) 32 | 33 | klog.Info("DRA QAT kubelet plugin") 34 | driverVersion.PrintDriverVersion(driverName) 35 | 36 | ctx := context.Background() 37 | 38 | if err := os.MkdirAll(driverPluginPath, 0750); err 
!= nil { 39 | return fmt.Errorf("could not create '%s': %v", driverPluginPath, err) 40 | } 41 | 42 | if d, err = newDriver(ctx); err != nil { 43 | return fmt.Errorf("failed to create kubelet plugin driver: %v", err) 44 | } 45 | 46 | plugin, err := kubeletplugin.Start( 47 | ctx, 48 | []any{d}, 49 | kubeletplugin.KubeClient(d.kubeclient), 50 | kubeletplugin.NodeName(d.nodename), 51 | kubeletplugin.DriverName(driverName), 52 | kubeletplugin.RegistrarSocketPath(pluginRegistrationPath), 53 | kubeletplugin.PluginSocketPath(driverPluginSocketPath), 54 | kubeletplugin.KubeletPluginSocketPath(driverPluginSocketPath)) 55 | if err != nil { 56 | return fmt.Errorf("failed to start kubelet plugin: %v", err) 57 | } 58 | 59 | d.plugin = plugin 60 | 61 | if err := d.UpdateDeviceResources(ctx); err != nil { 62 | return fmt.Errorf("failed to publish resources: %v", err) 63 | } 64 | 65 | klog.Infof("DRA kubelet plugin %s running...", driverName) 66 | 67 | sigc := make(chan os.Signal, 1) 68 | signal.Notify(sigc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 69 | <-sigc 70 | 71 | plugin.Stop() 72 | 73 | klog.Infof("DRA kubelet plugin %s done", driverName) 74 | 75 | return nil 76 | } 77 | 78 | func setupCmd() (*cobra.Command, error) { 79 | cmd := &cobra.Command{ 80 | Use: "kubelet-plugin", 81 | Short: "Intel QAT resource driver kubelet plugin", 82 | RunE: cmdRun, 83 | } 84 | 85 | logsconfig := logsapi.NewLoggingConfiguration() 86 | fgate := featuregate.NewFeatureGate() 87 | utilruntime.Must(logsapi.AddFeatureGates(fgate)) 88 | if err := logsapi.ValidateAndApply(logsconfig, fgate); err != nil { 89 | return nil, err 90 | } 91 | 92 | loggingFlags := cliflag.NamedFlagSets{} 93 | fs := loggingFlags.FlagSet("logging") 94 | logsapi.AddFlags(logsconfig, fs) 95 | logs.AddFlags(fs, logs.SkipLoggingConfigurationFlags()) 96 | 97 | cmd.PersistentFlags().AddFlagSet(fs) 98 | 99 | cols, _, _ := term.TerminalSize(cmd.OutOrStdout()) 100 | cliflag.SetUsageAndHelpFunc(cmd, loggingFlags, 
cols) 101 | 102 | return cmd, nil 103 | } 104 | 105 | func main() { 106 | cmd, err := setupCmd() 107 | if err != nil { 108 | fmt.Printf("Error: failed to start: %v", err) 109 | return 110 | } 111 | 112 | // Execute() already prints out the error. 113 | _ = cmd.Execute() 114 | } 115 | -------------------------------------------------------------------------------- /cmd/qat-showdevice/main.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | 10 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 11 | ) 12 | 13 | func printPFDevice(pfdev *device.PFDevice) { 14 | fmt.Printf("PF device: %s\n", pfdev.Device) 15 | fmt.Printf("State: %s\n", pfdev.State.String()) 16 | fmt.Printf("Services: %s\n", pfdev.Services.String()) 17 | fmt.Printf("Num VFs: %d\n", pfdev.NumVFs) 18 | fmt.Printf("Max VFs: %d\n", pfdev.TotalVFs) 19 | 20 | for _, vfdev := range pfdev.AvailableDevices { 21 | fmt.Printf("\tVF UID %s: device %s, device node %s, IOMMU %s, driver %s\n", vfdev.UID(), vfdev.PCIDevice(), vfdev.DeviceNode(), vfdev.Iommu(), vfdev.Driver()) 22 | } 23 | } 24 | 25 | func main() { 26 | pfdevices, err := device.New() 27 | if err != nil { 28 | fmt.Printf("Error: %v\n", err) 29 | return 30 | } 31 | 32 | if len(pfdevices) == 0 { 33 | fmt.Printf("No PF devices found\n") 34 | return 35 | } 36 | 37 | for _, pfdev := range pfdevices { 38 | printPFDevice(pfdev) 39 | fmt.Printf("---\n\n") 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /deployments/gaudi/base/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gaudi.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == 
"gaudi.intel.com" 10 | -------------------------------------------------------------------------------- /deployments/gaudi/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - namespace.yaml 4 | - resource-driver.yaml 5 | 6 | images: 7 | - name: intel/intel-gaudi-resource-driver 8 | newTag: v0.3.0 9 | -------------------------------------------------------------------------------- /deployments/gaudi/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: intel-gaudi-resource-driver 6 | -------------------------------------------------------------------------------- /deployments/gaudi/base/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: DaemonSet 4 | metadata: 5 | name: intel-gaudi-resource-driver-kubelet-plugin 6 | namespace: intel-gaudi-resource-driver 7 | labels: 8 | app: intel-gaudi-resource-driver-kubelet-plugin 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: intel-gaudi-resource-driver-kubelet-plugin 13 | template: 14 | metadata: 15 | labels: 16 | app: intel-gaudi-resource-driver-kubelet-plugin 17 | spec: 18 | serviceAccount: intel-gaudi-resource-driver-service-account 19 | serviceAccountName: intel-gaudi-resource-driver-service-account 20 | containers: 21 | - name: kubelet-plugin 22 | image: intel/intel-gaudi-resource-driver:v0.3.0 23 | imagePullPolicy: IfNotPresent 24 | command: ["/kubelet-gaudi-plugin", "-m"] 25 | env: 26 | - name: NODE_NAME 27 | valueFrom: 28 | fieldRef: 29 | fieldPath: spec.nodeName 30 | - name: POD_NAMESPACE 31 | valueFrom: 32 | fieldRef: 33 | fieldPath: metadata.namespace 34 | - name: SYSFS_ROOT 35 | value: "/sys" 36 | # Only use DEVFS_ROOT when using fake devfs with device-faker 37 | #- name: DEVFS_ROOT 38 | # value: 
"/devfs" 39 | 40 | volumeMounts: 41 | - name: plugins-registry 42 | mountPath: /var/lib/kubelet/plugins_registry 43 | - name: plugins 44 | mountPath: /var/lib/kubelet/plugins 45 | - name: cdi 46 | mountPath: /etc/cdi 47 | - name: varruncdi 48 | mountPath: /var/run/cdi 49 | - name: sysfs 50 | mountPath: "/sys" 51 | # Only use DEVFS_ROOT when using fake devfs with device-faker 52 | #- name: devfs 53 | # mountPath: "/devfs" 54 | securityContext: 55 | privileged: true 56 | capabilities: 57 | drop: [ "ALL" ] 58 | readOnlyRootFilesystem: true 59 | runAsUser: 0 60 | seccompProfile: 61 | type: RuntimeDefault 62 | volumes: 63 | - name: plugins-registry 64 | hostPath: 65 | path: /var/lib/kubelet/plugins_registry 66 | - name: plugins 67 | hostPath: 68 | path: /var/lib/kubelet/plugins 69 | - name: cdi 70 | hostPath: 71 | path: /etc/cdi 72 | - name: varruncdi 73 | hostPath: 74 | path: /var/run/cdi 75 | - name: sysfs 76 | hostPath: 77 | path: /sys 78 | # Only use DEVFS_ROOT when using fake devfs with device-faker 79 | #- name: devfs 80 | # hostPath: 81 | # path: /dev 82 | 83 | --- 84 | apiVersion: v1 85 | kind: ServiceAccount 86 | metadata: 87 | name: intel-gaudi-resource-driver-service-account 88 | namespace: intel-gaudi-resource-driver 89 | 90 | --- 91 | apiVersion: rbac.authorization.k8s.io/v1 92 | kind: ClusterRole 93 | metadata: 94 | name: intel-gaudi-resource-driver-role 95 | namespace: intel-gaudi-resource-driver 96 | rules: 97 | - apiGroups: [""] 98 | resources: ["nodes"] 99 | verbs: ["get"] 100 | - apiGroups: ["resource.k8s.io"] 101 | resources: ["resourceslices"] 102 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 103 | - apiGroups: ["resource.k8s.io"] 104 | resources: ["resourceclaims"] 105 | verbs: ["get"] 106 | 107 | --- 108 | apiVersion: rbac.authorization.k8s.io/v1 109 | kind: ClusterRoleBinding 110 | metadata: 111 | name: intel-gaudi-resource-driver-role-binding 112 | namespace: intel-gaudi-resource-driver 113 | subjects: 114 | - kind: 
ServiceAccount 115 | name: intel-gaudi-resource-driver-service-account 116 | namespace: intel-gaudi-resource-driver 117 | roleRef: 118 | kind: ClusterRole 119 | name: intel-gaudi-resource-driver-role 120 | apiGroup: rbac.authorization.k8s.io 121 | 122 | --- 123 | apiVersion: admissionregistration.k8s.io/v1 124 | kind: ValidatingAdmissionPolicy 125 | metadata: 126 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 127 | spec: 128 | failurePolicy: Fail 129 | matchConstraints: 130 | resourceRules: 131 | - apiGroups: ["resource.k8s.io"] 132 | apiVersions: ["v1beta1"] 133 | operations: ["CREATE", "UPDATE", "DELETE"] 134 | resources: ["resourceslices"] 135 | matchConditions: 136 | - name: isRestrictedUser 137 | expression: >- 138 | request.userInfo.username == "system:serviceaccount:intel-gaudi-resource-driver:intel-gaudi-resource-driver-service-account" 139 | variables: 140 | - name: userNodeName 141 | expression: >- 142 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 143 | - name: objectNodeName 144 | expression: >- 145 | (request.operation == "DELETE" ? 
oldObject : object).spec.?nodeName.orValue("") 146 | validations: 147 | - expression: variables.userNodeName != "" 148 | message: >- 149 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 150 | - expression: variables.userNodeName == variables.objectNodeName 151 | messageExpression: >- 152 | "this user running on node '"+variables.userNodeName+"' may not modify " + 153 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 154 | --- 155 | apiVersion: admissionregistration.k8s.io/v1 156 | kind: ValidatingAdmissionPolicyBinding 157 | metadata: 158 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 159 | spec: 160 | policyName: resourceslices-policy-dra-kubelet-plugin-gaudi 161 | validationActions: [Deny] 162 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/deployment-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: two-gaudi3 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gaudi 10 | deviceClassName: gaudi.intel.com 11 | count: 2 12 | selectors: 13 | - cel: 14 | expression: device.attributes["gaudi.intel.com"].model == 'Gaudi3' 15 | 16 | --- 17 | apiVersion: apps/v1 18 | kind: Deployment 19 | metadata: 20 | name: gaudi-test 21 | labels: 22 | app: inline-gaudi-deployment 23 | spec: 24 | replicas: 1 25 | selector: 26 | matchLabels: 27 | app: inline-gaudi-deployment 28 | template: 29 | metadata: 30 | labels: 31 | app: inline-gaudi-deployment 32 | spec: 33 | containers: 34 | - name: with-resource 35 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 36 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 300"] 37 | resources: 38 | claims: 39 | - name: resource 40 | - name: without-resource 41 | image: 
registry.k8s.io/e2e-test-images/busybox:1.29-2 42 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"] 43 | resourceClaims: 44 | - name: resource 45 | resourceClaimTemplateName: two-gaudi3 46 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/monitor-pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: monitor-claim 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gaudi 10 | deviceClassName: gaudi.intel.com 11 | adminAccess: true 12 | allocationMode: "All" 13 | --- 14 | apiVersion: v1 15 | kind: Pod 16 | metadata: 17 | name: monitor-pod 18 | spec: 19 | restartPolicy: Never 20 | containers: 21 | - name: monitor 22 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 23 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"] 24 | resources: 25 | claims: 26 | - name: resource 27 | resourceClaims: 28 | - name: resource 29 | resourceClaimTemplateName: monitor-claim 30 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaim 3 | metadata: 4 | name: claim1 5 | spec: 6 | devices: 7 | requests: 8 | - name: gaudi 9 | deviceClassName: gaudi.intel.com 10 | ## 11 | ## if one is not enough 12 | # count: 2 13 | ## 14 | ## requesting particular series 15 | # selectors: 16 | # - cel: 17 | # expression: device.attributes["gaudi.intel.com"].model == 'Gaudi2' 18 | ## 19 | ## for monitoring 20 | # adminAccess: true 21 | # allocationMode: "All" 22 | --- 23 | apiVersion: v1 24 | kind: Pod 25 | metadata: 26 | name: test-inline-claim 27 | spec: 28 | restartPolicy: Never 29 | containers: 30 | - name: with-resource 31 | image: 
registry.k8s.io/e2e-test-images/busybox:1.29-2 32 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"] 33 | resources: 34 | claims: 35 | - name: resource 36 | - name: without-resource 37 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 38 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 39 | resourceClaims: 40 | - name: resource 41 | resourceClaimName: claim1 42 | -------------------------------------------------------------------------------- /deployments/gaudi/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/device-faker/device-faker.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | initContainers: 10 | - name: device-faker 11 | image: ger-is-registry.caas.intel.com/dgpu-orchestration/intel-device-faker:v0.1.0 12 | imagePullPolicy: Always 13 | command: ["/device-faker", "gaudi", "-t", "/opt/templates/gaudi-template.json", "-d", "/tmp/fake-root"] 14 | volumeMounts: 15 | - name: fake-root 16 | mountPath: /tmp/fake-root 17 | containers: 18 | - name: kubelet-plugin 19 | env: 20 | - name: SYSFS_ROOT 21 | value: "/fake-sysfs" 22 | volumeMounts: 23 | - name: fake-root 24 | mountPath: /fake-sysfs 25 | subPath: sysfs 26 | - name: fake-root 27 | mountPath: /fake-dev/dri 28 | subPath: dev/dri 29 | - name: fake-root 30 | mountPath: /fake-cdi 31 | subPath: cdi 32 | volumes: 33 | - name: fake-root 34 | emptyDir: {} 35 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/device-faker/kustomization.yaml: -------------------------------------------------------------------------------- 1 | 
resources: 2 | - ../../base 3 | 4 | patches: 5 | - path: remove-sysfs.yaml 6 | - path: device-faker.yaml 7 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/device-faker/remove-sysfs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kubelet-plugin 11 | volumeMounts: 12 | - name: sysfs 13 | mountPath: /sys 14 | $patch: delete 15 | volumes: 16 | - name: sysfs 17 | $patch: delete 18 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/add-nodeselector-intel-gaudi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/gaudi: "true" 11 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-gaudi-device-rule.yaml 7 | 8 | patches: 9 | - path: add-nodeselector-intel-gaudi.yaml 10 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/nfd-intel-gaudi-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: 
intel-gaudi-device-rule 5 | spec: 6 | rules: 7 | - name: "intel.gaudi" 8 | labels: 9 | "intel.feature.node.kubernetes.io/gaudi": "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["1da3"]} 14 | device: {op: In, value: ["1020", "1030"]} 15 | -------------------------------------------------------------------------------- /deployments/gpu/base/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gpu.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == "gpu.intel.com" 10 | -------------------------------------------------------------------------------- /deployments/gpu/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - namespace.yaml 4 | - resource-driver.yaml 5 | 6 | images: 7 | - name: intel/intel-gpu-resource-driver 8 | newTag: v0.7.0 9 | -------------------------------------------------------------------------------- /deployments/gpu/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: intel-gpu-resource-driver 6 | -------------------------------------------------------------------------------- /deployments/gpu/examples/claim-external-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaim 3 | metadata: 4 | name: one-flex 5 | spec: 6 | devices: 7 | requests: 8 | - name: gpu 9 | deviceClassName: gpu.intel.com 10 | selectors: 11 | - cel: 12 | expression: device.attributes["gpu.intel.com"].family == 'Flex' 13 | -------------------------------------------------------------------------------- /deployments/gpu/examples/deployment-inline.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: gpu-4g 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | deviceClassName: gpu.intel.com 11 | selectors: 12 | - cel: 13 | expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0 14 | 15 | --- 16 | apiVersion: apps/v1 17 | kind: Deployment 18 | metadata: 19 | name: gpu-test 20 | labels: 21 | app: inline-gpu-deployment 22 | spec: 23 | replicas: 1 24 | selector: 25 | matchLabels: 26 | app: inline-gpu-deployment 27 | template: 28 | metadata: 29 | labels: 30 | app: inline-gpu-deployment 31 | spec: 32 | containers: 33 | - name: with-resource 34 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 35 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 300"] 36 | resources: 37 | claims: 38 | - name: resource 39 | - name: without-resource 40 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 41 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"] 42 | resourceClaims: 43 | - name: resource 44 | resourceClaimTemplateName: gpu-4g 45 | -------------------------------------------------------------------------------- /deployments/gpu/examples/monitor-pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: monitor-claim 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | deviceClassName: gpu.intel.com 11 | adminAccess: true 12 | allocationMode: "All" 13 | --- 14 | apiVersion: v1 15 | kind: Pod 16 | metadata: 17 | name: monitor-pod 18 | spec: 19 | restartPolicy: Never 20 | containers: 21 | - name: monitor 22 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 23 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 24 | resources: 25 | claims: 26 | - name: resource 27 | resourceClaims: 28 | - name: 
resource 29 | resourceClaimTemplateName: monitor-claim 30 | -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-for-claim-external-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: test-one-flex 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: with-resource 9 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 10 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 11 | resources: 12 | claims: 13 | - name: resource 14 | - name: without-resource 15 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 16 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 17 | resourceClaims: 18 | - name: resource 19 | resourceClaimName: one-flex 20 | -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-inline-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: claim1 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | deviceClassName: gpu.intel.com 11 | ## 12 | ## if one is not enough 13 | # count: 2 14 | ## 15 | ## requesting particular series 16 | # selectors: 17 | # - cel: 18 | # expression: device.attributes["gpu.intel.com"].family == 'Flex' 19 | # - cel: 20 | # expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0 21 | 22 | ## for monitoring 23 | # adminAccess: true 24 | # allocationMode: "All" 25 | --- 26 | apiVersion: v1 27 | kind: Pod 28 | metadata: 29 | name: test-inline-claim 30 | spec: 31 | restartPolicy: Never 32 | containers: 33 | - name: with-resource 34 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 35 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 36 | resources: 37 | claims: 38 | - name: resource 39 | - name: without-resource 40 | 
image: registry.k8s.io/e2e-test-images/busybox:1.29-2 41 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 42 | resourceClaims: 43 | - name: resource 44 | resourceClaimTemplateName: claim1 45 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/gpu-monitor-claim.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: intel-gpu-monitor-claim 5 | spec: 6 | metadata: 7 | labels: 8 | app: intel-gpu-monitor-claim 9 | spec: 10 | devices: 11 | requests: 12 | - name: gpu 13 | deviceClassName: gpu.intel.com 14 | adminAccess: true 15 | allocationMode: "All" 16 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: monitoring 2 | resources: 3 | - https://github.com/intel/xpumanager/deployment/kubernetes/daemonset/base/?ref=V1.2.39 4 | - gpu-monitor-claim.yaml 5 | patches: 6 | - path: xpumd-delete-limits.yaml 7 | target: 8 | kind: DaemonSet 9 | - path: xpumd-add-dra-resource.yaml 10 | target: 11 | kind: DaemonSet 12 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/xpumd-add-dra-resource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-xpumanager 5 | spec: 6 | template: 7 | spec: 8 | resourceClaims: 9 | - name: intel-gpu-resource 10 | resourceClaimTemplateName: intel-gpu-monitor-claim 11 | containers: 12 | - name: xpumd 13 | resources: 14 | claims: 15 | - name: intel-gpu-resource 16 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/xpumd-delete-limits.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-xpumanager 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: xpumd 10 | resources: 11 | limits: 12 | # gpu.intel.com/i915_monitoring: 1 13 | $patch: delete 14 | -------------------------------------------------------------------------------- /deployments/gpu/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/device-faker.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | initContainers: 10 | - name: device-faker 11 | image: ger-is-registry.caas.intel.com/dgpu-orchestration/intel-device-faker:v0.1.0 12 | imagePullPolicy: Always 13 | command: ["/device-faker", "gpu", "-t", "/opt/templates/gpu-template.json", "-d", "/tmp/fake-root"] 14 | volumeMounts: 15 | - name: fake-root 16 | mountPath: /tmp/fake-root 17 | containers: 18 | - name: kubelet-plugin 19 | env: 20 | - name: SYSFS_ROOT 21 | value: "/fake-sysfs" 22 | volumeMounts: 23 | - name: fake-root 24 | mountPath: /fake-sysfs 25 | subPath: sysfs 26 | - name: fake-root 27 | mountPath: /fake-dev/dri 28 | subPath: dev/dri 29 | - name: fake-root 30 | mountPath: /fake-cdi 31 | subPath: cdi 32 | volumes: 33 | - name: fake-root 34 | emptyDir: {} 35 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../../base 3 | 4 | patches: 5 | - path: remove-sysfs.yaml 6 | - path: device-faker.yaml 7 | 
-------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/remove-sysfs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kubelet-plugin 11 | volumeMounts: 12 | - name: sysfs 13 | mountPath: /sysfs 14 | $patch: delete 15 | volumes: 16 | - name: sysfs 17 | $patch: delete 18 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/add-nodeselector-intel-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/gpu: "true" 11 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-gpu-device-rule.yaml 7 | - nfd-intel-gpu-platform-labeling.yaml 8 | 9 | patches: 10 | - path: add-nodeselector-intel-gpu.yaml 11 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-gpu-device-rule 5 | spec: 6 | rules: 7 | - name: intel.gpu.device 8 | labels: 9 | 
"intel.feature.node.kubernetes.io/gpu": "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["8086"]} 14 | class: {op: In, value: ["0300", "0380"]} 15 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-platform-labeling.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-gpu-platform-labeling 5 | spec: 6 | rules: 7 | # A_Series (Alchemist) 8 | - labels: 9 | gpu.intel.com/family: "A_Series" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | class: {op: In, value: ["0300"]} 14 | vendor: {op: In, value: ["8086"]} 15 | device: 16 | op: In 17 | value: 18 | - "56a6" 19 | - "56a5" 20 | - "56a1" 21 | - "56a0" 22 | - "5694" 23 | - "5693" 24 | - "5692" 25 | - "5691" 26 | - "5690" 27 | - "56b3" 28 | - "56b2" 29 | - "56a4" 30 | - "56a3" 31 | - "5697" 32 | - "5696" 33 | - "5695" 34 | - "56b1" 35 | - "56b0" 36 | name: intel.gpu.a.series 37 | # Max_Series 38 | - labels: 39 | gpu.intel.com/family: "Max_Series" 40 | matchFeatures: 41 | - feature: pci.device 42 | matchExpressions: 43 | class: {op: In, value: ["0380"]} 44 | vendor: {op: In, value: ["8086"]} 45 | device: 46 | op: In 47 | value: 48 | - "0bda" 49 | - "0bd5" 50 | - "0bd9" 51 | - "0bdb" 52 | - "0bd7" 53 | - "0bd6" 54 | - "0bd0" 55 | name: intel.gpu.max.series 56 | # Flex_Series 57 | - labels: 58 | gpu.intel.com/family: "Flex_Series" 59 | matchFeatures: 60 | - feature: pci.device 61 | matchExpressions: 62 | class: {op: In, value: ["0300", "0380"]} 63 | vendor: {op: In, value: ["8086"]} 64 | device: 65 | op: In 66 | value: 67 | - "0f00" 68 | - "0f01" 69 | - "0f02" 70 | name: intel.gpu.flex.series 71 | -------------------------------------------------------------------------------- /deployments/qat/base/device-class.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: qat.intel.com 5 | 6 | spec: 7 | selectors: 8 | - cel: 9 | expression: device.driver == "qat.intel.com" 10 | -------------------------------------------------------------------------------- /deployments/qat/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - namespace.yaml 4 | - resource-driver.yaml 5 | 6 | images: 7 | - name: intel/intel-qat-resource-driver 8 | newTag: v0.2.0 9 | -------------------------------------------------------------------------------- /deployments/qat/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: intel-qat-resource-driver 5 | -------------------------------------------------------------------------------- /deployments/qat/base/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-qat-resource-driver-kubelet-plugin 5 | namespace: intel-qat-resource-driver 6 | labels: 7 | app: intel-qat-resource-driver-kubelet-plugin 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-qat-resource-driver-kubelet-plugin 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-qat-resource-driver-kubelet-plugin 16 | spec: 17 | serviceAccount: intel-qat-resource-driver-service-account 18 | serviceAccountName: intel-qat-resource-driver-service-account 19 | containers: 20 | - name: kubelet-plugin 21 | image: intel/intel-qat-resource-driver:v0.2.0 22 | imagePullPolicy: IfNotPresent 23 | command: ["/kubelet-qat-plugin"] 24 | env: 25 | - name: NODE_NAME 26 | valueFrom: 27 | fieldRef: 28 | fieldPath: spec.nodeName 29 | - name: SYSFS_ROOT 30 | value: "/sysfs" 31 
| volumeMounts: 32 | - name: plugins-registry 33 | mountPath: /var/lib/kubelet/plugins_registry 34 | - name: plugins 35 | mountPath: /var/lib/kubelet/plugins 36 | - name: cdi 37 | mountPath: /etc/cdi 38 | - name: varruncdi 39 | mountPath: /var/run/cdi 40 | - name: sysfs 41 | mountPath: /sysfs 42 | - name: qatconfiguration 43 | mountPath: /defaults 44 | securityContext: 45 | privileged: true 46 | readOnlyRootFilesystem: true 47 | seccompProfile: 48 | type: RuntimeDefault 49 | volumes: 50 | - name: plugins-registry 51 | hostPath: 52 | path: /var/lib/kubelet/plugins_registry 53 | - name: plugins 54 | hostPath: 55 | path: /var/lib/kubelet/plugins 56 | - name: cdi 57 | hostPath: 58 | path: /etc/cdi 59 | - name: varruncdi 60 | hostPath: 61 | path: /var/run/cdi 62 | - name: sysfs 63 | hostPath: 64 | path: /sys 65 | - name: qatconfiguration 66 | configMap: 67 | name: intel-qat-resource-driver-configuration 68 | optional: true 69 | 70 | --- 71 | apiVersion: v1 72 | kind: ServiceAccount 73 | metadata: 74 | name: intel-qat-resource-driver-service-account 75 | namespace: intel-qat-resource-driver 76 | 77 | --- 78 | apiVersion: rbac.authorization.k8s.io/v1 79 | kind: ClusterRole 80 | metadata: 81 | name: intel-qat-resource-driver-role 82 | namespace: intel-qat-resource-driver 83 | rules: 84 | - apiGroups: [""] 85 | resources: ["nodes"] 86 | verbs: ["get"] 87 | - apiGroups: ["resource.k8s.io"] 88 | resources: ["resourceslices"] 89 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 90 | - apiGroups: ["resource.k8s.io"] 91 | resources: ["resourceclaims"] 92 | verbs: ["get"] 93 | 94 | --- 95 | apiVersion: rbac.authorization.k8s.io/v1 96 | kind: ClusterRoleBinding 97 | metadata: 98 | name: intel-qat-resource-driver-role-binding 99 | namespace: intel-qat-resource-driver 100 | subjects: 101 | - kind: ServiceAccount 102 | name: intel-qat-resource-driver-service-account 103 | namespace: intel-qat-resource-driver 104 | roleRef: 105 | kind: ClusterRole 106 | name: 
intel-qat-resource-driver-role 107 | apiGroup: rbac.authorization.k8s.io 108 | --- 109 | apiVersion: admissionregistration.k8s.io/v1 110 | kind: ValidatingAdmissionPolicy 111 | metadata: 112 | name: resourceslices-policy-dra-kubelet-plugin-qat 113 | spec: 114 | failurePolicy: Fail 115 | matchConstraints: 116 | resourceRules: 117 | - apiGroups: ["resource.k8s.io"] 118 | apiVersions: ["v1beta1"] 119 | operations: ["CREATE", "UPDATE", "DELETE"] 120 | resources: ["resourceslices"] 121 | matchConditions: 122 | - name: isRestrictedUser 123 | expression: >- 124 | request.userInfo.username == "system:serviceaccount:intel-qat-resource-driver:intel-qat-resource-driver-service-account" 125 | variables: 126 | - name: userNodeName 127 | expression: >- 128 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 129 | - name: objectNodeName 130 | expression: >- 131 | (request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("") 132 | validations: 133 | - expression: variables.userNodeName != "" 134 | message: >- 135 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 136 | - expression: variables.userNodeName == variables.objectNodeName 137 | messageExpression: >- 138 | "this user running on node '"+variables.userNodeName+"' may not modify " + 139 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 140 | --- 141 | apiVersion: admissionregistration.k8s.io/v1 142 | kind: ValidatingAdmissionPolicyBinding 143 | metadata: 144 | name: resourceslices-policy-dra-kubelet-plugin-qat 145 | spec: 146 | policyName: resourceslices-policy-dra-kubelet-plugin-qat 147 | validationActions: [Deny] 148 | -------------------------------------------------------------------------------- /deployments/qat/examples/deployment-inline.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: qat-template-sym 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: qat-request-sym 10 | deviceClassName: qat.intel.com 11 | selectors: 12 | - cel: 13 | expression: |- 14 | device.attributes["qat.intel.com"].services == "sym" || 15 | device.attributes["qat.intel.com"].services == "sym;asym" || 16 | device.attributes["qat.intel.com"].services == "sym;dc" || 17 | device.attributes["qat.intel.com"].services == "asym;sym" || 18 | device.attributes["qat.intel.com"].services == "dc;sym" 19 | 20 | --- 21 | apiVersion: resource.k8s.io/v1beta1 22 | kind: ResourceClaimTemplate 23 | metadata: 24 | name: qat-template-asym 25 | spec: 26 | spec: 27 | devices: 28 | requests: 29 | - name: qat-request-asym 30 | deviceClassName: qat.intel.com 31 | selectors: 32 | - cel: 33 | expression: |- 34 | device.attributes["qat.intel.com"].services == "asym" || 35 | device.attributes["qat.intel.com"].services == "asym;sym" || 36 | device.attributes["qat.intel.com"].services == "asym;dc" || 37 | device.attributes["qat.intel.com"].services == "sym;asym" || 38 | device.attributes["qat.intel.com"].services == "dc;asym" 39 | 40 | --- 41 | apiVersion: resource.k8s.io/v1beta1 42 | kind: ResourceClaimTemplate 43 | metadata: 44 | name: qat-template-dc 45 | spec: 46 | spec: 47 | devices: 48 | requests: 49 | - name: qat-request-dc 50 | deviceClassName: qat.intel.com 51 | selectors: 52 | - cel: 53 | expression: |- 54 | device.attributes["qat.intel.com"].services == "dc" || 55 | device.attributes["qat.intel.com"].services == "dc;sym" || 56 | device.attributes["qat.intel.com"].services == "dc;asym" || 57 | device.attributes["qat.intel.com"].services == "sym;dc" || 58 | device.attributes["qat.intel.com"].services == "asym;dc" || 59 | device.attributes["qat.intel.com"].services == "dcc" 60 | 61 | --- 62 | apiVersion: apps/v1 63 | kind: Deployment 64 | metadata: 65 | name: qat-sample-sym 66 | labels: 67 | app: 
inline-qat-deployment 68 | spec: 69 | replicas: 1 70 | selector: 71 | matchLabels: 72 | app: inline-qat-deployment 73 | template: 74 | metadata: 75 | labels: 76 | app: inline-qat-deployment 77 | spec: 78 | containers: 79 | - name: with-resource 80 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 81 | command: ["sh", "-c", "ls -la /dev/vfio/ && sleep 300"] 82 | securityContext: 83 | capabilities: 84 | add: 85 | ["IPC_LOCK"] 86 | resources: 87 | claims: 88 | - name: resource-sym 89 | - name: resource-asym 90 | - name: resource-dc 91 | - name: without-resource 92 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 93 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"] 94 | resourceClaims: 95 | - name: resource-sym 96 | resourceClaimTemplateName: qat-template-sym 97 | - name: resource-asym 98 | resourceClaimTemplateName: qat-template-asym 99 | - name: resource-dc 100 | resourceClaimTemplateName: qat-template-dc 101 | -------------------------------------------------------------------------------- /deployments/qat/examples/intel-qat-resource-driver-configuration.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: intel-qat-resource-driver-configuration 5 | namespace: intel-qat-resource-driver 6 | data: 7 | # Map of <PCI device address>: <services> pairs, in a map indexed by hostname 8 | qatdefaults.config: | 9 | { "host-name-here": 10 | { 11 | "0000:aa:00.0": "asym;sym", 12 | "0000:bb:00.0": "dc;sym" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /deployments/qat/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/add-nodeselector-intel-qat.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 
2 | kind: DaemonSet 3 | metadata: 4 | name: intel-qat-resource-driver-kubelet-plugin 5 | namespace: intel-qat-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/qat: "true" 11 | -------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-qat-device-rule.yaml 7 | 8 | patches: 9 | - path: add-nodeselector-intel-qat.yaml 10 | -------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/nfd-intel-qat-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-qat-device-rule 5 | spec: 6 | rules: 7 | - name: "intel.qat" 8 | labels: 9 | intel.feature.node.kubernetes.io/qat: "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["8086"]} 14 | device: {op: In, value: ["4940", "4941", "4944", "4946"]} 15 | class: {op: In, value: ["0b40"]} 16 | - feature: kernel.loadedmodule 17 | matchExpressions: 18 | intel_qat: {op: Exists} 19 | matchAny: 20 | - matchFeatures: 21 | - feature: kernel.loadedmodule 22 | matchExpressions: 23 | vfio_pci: {op: Exists} 24 | - matchFeatures: 25 | - feature: kernel.enabledmodule 26 | matchExpressions: 27 | vfio-pci: {op: Exists} 28 | -------------------------------------------------------------------------------- /deployments/qat/tests/openssl-qat-engine/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - openssl-qat-engine.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: 
Kustomization 6 | images: 7 | - name: openssl-qat-engine:devel 8 | newName: intel/openssl-qat-engine 9 | newTag: devel 10 | -------------------------------------------------------------------------------- /deployments/qat/tests/openssl-qat-engine/openssl-qat-engine.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: openssl-qat-engine-asym 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: openssl-qat-engine-asym 9 | image: openssl-qat-engine:devel 10 | imagePullPolicy: IfNotPresent 11 | command: ["testapp","-engine","qathwtest","-async_jobs","1","-c","1","-n","1","-nc","1","-v","-hw_algo","0x0029"] 12 | securityContext: 13 | readOnlyRootFilesystem: true 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | add: 17 | ["IPC_LOCK"] 18 | resources: 19 | claims: 20 | - name: qat-resource-asym 21 | resourceClaims: 22 | - name: qat-resource-asym 23 | resourceClaimTemplateName: qat-template-asym 24 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/compress-perf.yaml: -------------------------------------------------------------------------------- 1 | kind: Pod 2 | apiVersion: v1 3 | metadata: 4 | name: qat-dpdk-test-compress-perf 5 | spec: 6 | containers: 7 | - name: compress-perf 8 | image: crypto-perf:devel 9 | imagePullPolicy: IfNotPresent 10 | env: 11 | - name: TESTCMD 12 | value: "compress" 13 | - name: PTEST 14 | value: "--driver-name compress_qat --input-file /var/data/file.txt --seg-sz 8192 --compress-level 1:1:9 --num-iter 10 --extended-input-sz 1048576 --max-num-sgl-segs 16 --huffman-enc fixed" 15 | volumeMounts: 16 | - mountPath: /dev/hugepages 17 | name: hugepage 18 | - mountPath: /var/run/dpdk 19 | name: dpdk-runtime 20 | - mountPath: /var/data/ 21 | name: testfile 22 | resources: 23 | claims: 24 | - name: qat-resource-dc 25 | requests: 26 | cpu: "3" 27 | memory: "128Mi" 28 | 
hugepages-2Mi: "128Mi" 29 | limits: 30 | cpu: "3" 31 | memory: "128Mi" 32 | hugepages-2Mi: "128Mi" 33 | securityContext: 34 | readOnlyRootFilesystem: true 35 | allowPrivilegeEscalation: false 36 | capabilities: 37 | add: 38 | ["IPC_LOCK"] 39 | restartPolicy: Never 40 | volumes: 41 | - name: dpdk-runtime 42 | emptyDir: 43 | medium: Memory 44 | - name: hugepage 45 | emptyDir: 46 | medium: HugePages 47 | - name: testfile 48 | configMap: 49 | name: test-data 50 | resourceClaims: 51 | - name: qat-resource-dc 52 | resourceClaimTemplateName: qat-template-dc 53 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/crypto-perf.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Pod 3 | apiVersion: v1 4 | metadata: 5 | name: qat-dpdk-test-crypto-perf 6 | spec: 7 | containers: 8 | - name: crypto-perf 9 | image: crypto-perf:devel 10 | imagePullPolicy: IfNotPresent 11 | env: 12 | - name: TESTCMD 13 | value: "crypto" 14 | - name: PTEST 15 | value: "--ptest throughput --devtype crypto_qat --optype cipher-only --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16 --total-ops 10000000 --burst-sz 32 --buffer-sz 64" 16 | volumeMounts: 17 | - mountPath: /dev/hugepages 18 | name: hugepage 19 | - mountPath: /var/run/dpdk 20 | name: dpdk-runtime 21 | resources: 22 | claims: 23 | - name: qat-resource-sym 24 | requests: 25 | cpu: "3" 26 | memory: "128Mi" 27 | hugepages-2Mi: "128Mi" 28 | limits: 29 | cpu: "3" 30 | memory: "128Mi" 31 | hugepages-2Mi: "128Mi" 32 | securityContext: 33 | readOnlyRootFilesystem: true 34 | allowPrivilegeEscalation: false 35 | capabilities: 36 | add: 37 | ["IPC_LOCK"] 38 | restartPolicy: Never 39 | volumes: 40 | - name: dpdk-runtime 41 | emptyDir: 42 | medium: Memory 43 | - name: hugepage 44 | emptyDir: 45 | medium: HugePages 46 | resourceClaims: 47 | - name: qat-resource-sym 48 | resourceClaimTemplateName: qat-template-sym 49 | 
-------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/kustomization.yaml: -------------------------------------------------------------------------------- 1 | configMapGenerator: 2 | - files: 3 | - file.txt 4 | name: test-data 5 | 6 | resources: 7 | - crypto-perf.yaml 8 | - compress-perf.yaml 9 | 10 | apiVersion: kustomize.config.k8s.io/v1beta1 11 | kind: Kustomization 12 | images: 13 | - name: crypto-perf:devel 14 | newName: intel/crypto-perf 15 | newTag: devel 16 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/modified-cluster-setup.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeadm.k8s.io/v1beta3 2 | kind: ClusterConfiguration 3 | apiServer: 4 | extraArgs: 5 | feature-gates: "DynamicResourceAllocation=true" 6 | runtime-config: "api/alpha=true" 7 | controllerManager: 8 | extraArgs: 9 | feature-gates: "DynamicResourceAllocation=true" 10 | scheduler: 11 | extraArgs: 12 | "feature-gates": "DynamicResourceAllocation=true" 13 | --- 14 | apiVersion: kubelet.config.k8s.io/v1beta1 15 | kind: KubeletConfiguration 16 | featureGates: 17 | DynamicResourceAllocation: true 18 | # DPDK applications that use QAT devices requires cpu manager policy as static. 19 | # In addition, resources should be reserved to enable it. 
20 | cpuManagerPolicy: static 21 | kubeReserved: 22 | cpu: "1" 23 | memory: "2Gi" 24 | ephemeral-storage: "1Gi" 25 | --- 26 | apiVersion: kubeadm.k8s.io/v1beta3 27 | kind: InitConfiguration 28 | nodeRegistration: 29 | criSocket: "unix:///var/run/crio/crio.sock" 30 | #criSocket: "unix:///var/run/containerd/containerd.sock" 31 | --- 32 | apiVersion: kubeproxy.config.k8s.io/v1alpha1 33 | kind: KubeProxyConfiguration 34 | featureGates: 35 | DynamicResourceAllocation: true 36 | -------------------------------------------------------------------------------- /deployments/qat/tests/qatlib-sample-code/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - qatlib-sample-code.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | images: 7 | - name: openssl-qat-engine:devel 8 | newName: intel/openssl-qat-engine 9 | newTag: devel 10 | -------------------------------------------------------------------------------- /deployments/qat/tests/qatlib-sample-code/qatlib-sample-code.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: qatlib-sample-code-sym 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: qatlib-sample-code-sym 9 | image: openssl-qat-engine:devel 10 | imagePullPolicy: IfNotPresent 11 | command: ["cpa_sample_code", "runTests=1"] 12 | securityContext: 13 | readOnlyRootFilesystem: true 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | add: 17 | ["IPC_LOCK"] 18 | resources: 19 | claims: 20 | - name: qat-resource-sym 21 | resourceClaims: 22 | - name: qat-resource-sym 23 | resourceClaimTemplateName: qat-template-sym 24 | --- 25 | apiVersion: v1 26 | kind: Pod 27 | metadata: 28 | name: qatlib-sample-code-dc 29 | spec: 30 | restartPolicy: Never 31 | containers: 32 | - name: qatlib-sample-code-dc 33 | image: openssl-qat-engine:devel 34 | imagePullPolicy: IfNotPresent 
35 | command: ["cpa_sample_code", "runTests=32"] 36 | securityContext: 37 | readOnlyRootFilesystem: true 38 | allowPrivilegeEscalation: false 39 | capabilities: 40 | add: 41 | ["IPC_LOCK"] 42 | resources: 43 | claims: 44 | - name: qat-resource-dc 45 | resourceClaims: 46 | - name: qat-resource-dc 47 | resourceClaimTemplateName: qat-template-dc 48 | -------------------------------------------------------------------------------- /deployments/qat/tests/resource-claim-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: qat-template-sym 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: qat-request-sym 10 | deviceClassName: qat.intel.com 11 | selectors: 12 | - cel: 13 | expression: |- 14 | device.attributes["qat.intel.com"].services == "sym" || 15 | device.attributes["qat.intel.com"].services == "sym;asym" || 16 | device.attributes["qat.intel.com"].services == "sym;dc" || 17 | device.attributes["qat.intel.com"].services == "asym;sym" || 18 | device.attributes["qat.intel.com"].services == "dc;sym" 19 | --- 20 | apiVersion: resource.k8s.io/v1beta1 21 | kind: ResourceClaimTemplate 22 | metadata: 23 | name: qat-template-asym 24 | spec: 25 | spec: 26 | devices: 27 | requests: 28 | - name: qat-request-asym 29 | deviceClassName: qat.intel.com 30 | selectors: 31 | - cel: 32 | expression: |- 33 | device.attributes["qat.intel.com"].services == "asym" || 34 | device.attributes["qat.intel.com"].services == "asym;sym" || 35 | device.attributes["qat.intel.com"].services == "asym;dc" || 36 | device.attributes["qat.intel.com"].services == "sym;asym" || 37 | device.attributes["qat.intel.com"].services == "dc;asym" 38 | --- 39 | apiVersion: resource.k8s.io/v1beta1 40 | kind: ResourceClaimTemplate 41 | metadata: 42 | name: qat-template-dc 43 | spec: 44 | spec: 45 | devices: 46 | requests: 47 | - name: qat-request-dc 48 | deviceClassName: 
qat.intel.com 49 | selectors: 50 | - cel: 51 | expression: |- 52 | device.attributes["qat.intel.com"].services == "dc" || 53 | device.attributes["qat.intel.com"].services == "dc;sym" || 54 | device.attributes["qat.intel.com"].services == "dc;asym" || 55 | device.attributes["qat.intel.com"].services == "sym;dc" || 56 | device.attributes["qat.intel.com"].services == "asym;dc" || 57 | device.attributes["qat.intel.com"].services == "dcc" 58 | -------------------------------------------------------------------------------- /doc/CLUSTER_SETUP.md: -------------------------------------------------------------------------------- 1 | # Setting up new K8s cluster for usage with Dynamic Resource Allocation resource drivers 2 | 3 | - In any uncertainty, refer to main [Kubernetes installation documentation](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) . 4 | - Check what version of Kubernetes is [required](../README.md#supported-kubernetes-versions) 5 | - Ensure you are running either CRI-O 1.23+ or Containerd 1.7+ with CDI support enabled, and that [cluster-config](../hack/clusterconfig.yaml) file uses `criSocket` matching it. 6 | - Make sure to enable both `DynamicResourceAllocation` 7 | [feature-gate](https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/), 8 | and alpha API for the Kubernetes api-server during your cluster initialization. 9 | - Example cluster initialization is in [cluster-config](../hack/clusterconfig.yaml) file 10 | ```bash 11 | sudo -E kubeadm init --config hack/clusterconfig.yaml 12 | ``` 13 | - Deploy cni . 14 | - Verify that `coredns` pod(s) are up: `kubectl get pods -A | grep dns`. 15 | 16 | ## Enable CDI in Containerd 17 | 18 | Containerd config file should have `enable_cdi` and `cdi_specs_dir`. 
Example `/etc/containerd/config.toml`: 19 | ``` 20 | version = 2 21 | [plugins] 22 | [plugins."io.containerd.grpc.v1.cri"] 23 | enable_cdi = true 24 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"] 25 | ``` 26 | 27 | ## Using minikube 28 | 29 | To create a minikube cluster with DRA, use the command (change the K8s version in the last parameter if needed): 30 | ```shell 31 | minikube start \ 32 | --feature-gates=DynamicResourceAllocation=true \ 33 | --extra-config=apiserver.feature-gates=DynamicResourceAllocation=true \ 34 | --extra-config=apiserver.runtime-config=resource.k8s.io/v1beta1=true \ 35 | --extra-config=scheduler.feature-gates=DynamicResourceAllocation=true \ 36 | --extra-config=controller-manager.feature-gates=DynamicResourceAllocation=true \ 37 | --extra-config=kubelet.feature-gates=DynamicResourceAllocation=true \ 38 | --container-runtime=containerd \ 39 | --kubernetes-version=1.32.0 40 | ``` 41 | 42 | Minikube will start its own Containerd inside the minikube docker container, where CDI needs to be 43 | enabled. Connect to the minikube container and edit containerd config: 44 | ```shell 45 | docker exec -it minikube /bin/bash 46 | vi /etc/containerd/config.toml 47 | ``` 48 | 49 | Add two lines into the `[plugins."io.containerd.grpc.v1.cri"]` section: 50 | ``` 51 | [plugins."io.containerd.grpc.v1.cri"] 52 | enable_cdi = true 53 | cdi_specs_dir = ["/etc/cdi", "/var/run/cdi"] 54 | ``` 55 | 56 | Then save it, exit editor, and restart the containerd that runs inside the minikube 57 | ``` 58 | systemctl restart containerd 59 | ``` 60 | 61 | At last, exit from the minikube container. 62 | -------------------------------------------------------------------------------- /doc/cdi-spec-generator/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel CDI Spec Generator 2 | A pre-compiled binary is already available for download, eliminating the need for manual building. 
See documentation [README.md](README.md#Releases) 3 | 4 | ## Prerequisites 5 | - Go 1.22 6 | 7 | ## Building 8 | 1. Clone the repository 9 | ```bash 10 | git clone https://github.com/intel/intel-resource-drivers-for-kubernetes.git 11 | cd intel-resource-drivers-for-kubernetes/cmd/cdi-specs-generator 12 | ``` 13 | 14 | 2. Build the executable 15 | ```bash 16 | go build -o intel-cdi-specs-generator main.go 17 | ``` 18 | This command will generate an executable named intel-cdi-specs-generator in the current directory. 19 | 20 | ## Verification 21 | To verify that the build was successful, you can check the version of the tool by running: 22 | ```bash 23 | intel-cdi-specs-generator --version 24 | ``` -------------------------------------------------------------------------------- /doc/cdi-spec-generator/README.md: -------------------------------------------------------------------------------- 1 | # Intel CDI Spec Generator 2 | 3 | ## Overview 4 | The Intel CDI Specs Generator is a command line tool to generate Container Device Interface (CDI) specifications for supported accelerators. 5 | 6 | ## Prerequisites 7 | - Administrative privileges on the system to write CDI specs. 8 | 9 | ## Usage 10 | Execute the built executable with the type of device you wish to generate CDI specs for: 11 | ```bash 12 | intel-cdi-specs-generator <gpu|gaudi> 13 | ``` 14 | 15 | Supported device types: 16 | - gpu: Use this option to generate CDI specs for Intel GPUs. 17 | - gaudi: Use this option to generate CDI specs for Intel Gaudi accelerators. 18 | 19 | ## Display Version 20 | To display the version of the binary, use the following command: 21 | ```bash 22 | intel-cdi-specs-generator --version 23 | ``` 24 | 25 | ## Example Usage 26 | To generate CDI specifications for GPUs, run the tool with gpu as an argument: 27 | ```bash 28 | intel-cdi-specs-generator gpu 29 | ``` 30 | This command will detect supported GPUs on the system, and ensure that there is a CDI device record for each of them. 
31 | 32 | 33 | ## Building 34 | - [How to build CDI Spec Generator](BUILD.md) 35 | 36 | ## Releases 37 | The binary is available for download in the releases section: 38 | - [Intel Resource Drivers for Kubernetes releases](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases) 39 | - [CDI Spec Generator v0.1.0](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0) 40 | -------------------------------------------------------------------------------- /doc/gaudi/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel Gaudi Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make gaudi-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry GAUDI_IMAGE_NAME=myimage GAUDI_IMAGE_VERSION=myversion make gaudi-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, GAUDI_IMAGE_NAME, GAUDI_IMAGE_VERSION even if specified): 26 | ```bash 27 | GAUDI_IMAGE_TAG=myregistry/myimagename:myversion make gaudi-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make gaudi-container-push 33 | ``` 34 | or 35 | ```bash 36 | GAUDI_IMAGE_TAG=registry.local/intel-gaudi-resource-driver:latest make gaudi-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- /doc/gaudi/README.md: 
-------------------------------------------------------------------------------- 1 | # Intel Gaudi resource driver for Kubernetes 2 | 3 | CAUTION: This is beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+. 12 | 13 | ## Supported Kubernetes Versions 14 | 15 | Supported Kubernetes versions are listed below: 16 | 17 | | Branch | Kubernetes branch/version | Status | DRA | 18 | |:------------------|:--------------------------------|:------------|:-------------------------------| 19 | | v0.1.0 | Kubernetes v1.27 ~ v1.30 | supported | Classic, Structured Parameters | 20 | | v0.2.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 21 | | v0.3.0 | Kubernetes v1.32+ | supported | Structured Parameters | 22 | 23 | ## Documentation 24 | 25 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 26 | - [How to deploy and use Intel Gaudi resource driver](USAGE.md) 27 | - Optional: [How to build Intel Gaudi resource driver container image](BUILD.md) 28 | -------------------------------------------------------------------------------- /doc/gpu/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel GPU Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 
14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make gpu-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry GPU_IMAGE_NAME=myimage GPU_IMAGE_VERSION=myversion make gpu-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, GPU_IMAGE_NAME, GPU_IMAGE_VERSION even if specified): 26 | ```bash 27 | GPU_IMAGE_TAG=myregistry/myimagename:myversion make gpu-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make gpu-container-push 33 | ``` 34 | or 35 | ```bash 36 | GPU_IMAGE_TAG=registry.local/intel-gpu-resource-driver:latest make gpu-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- /doc/gpu/README.md: -------------------------------------------------------------------------------- 1 | # Intel GPU resource driver for Kubernetes 2 | 3 | CAUTION: This is beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+. 
12 | 13 | ## Supported GPU devices (with Linux kernel Intel `i915` GPU driver): 14 | - Intel® Data Center GPU Max Series 15 | - Intel® Data Center GPU Flex Series 16 | - Intel® Arc A-Series 17 | - Intel® Iris® Xe MAX 18 | - Intel® Integrated graphics 19 | 20 | ## Supported Kubernetes Versions 21 | 22 | Supported Kubernetes versions are listed below: 23 | 24 | | Branch | Kubernetes branch/version | Status | DRA | 25 | |:------------------|:---------------------------------|:------------|:-------------------------------| 26 | | v0.1.0-beta | Kubernetes v1.26 branch v1.26.x | unsupported | Classic | 27 | | v0.1.1-beta | Kubernetes v1.27 branch v1.27.x | unsupported | Classic | 28 | | v0.2.0 | Kubernetes v1.28 branch v1.28.x | unsupported | Classic | 29 | | v0.3.0 | Kubernetes v1.28+ | unsupported | Classic | 30 | | v0.4.0 | Kubernetes v1.28+ | unsupported | Classic | 31 | | v0.5.0 | Kubernetes v1.27 - v1.30 | supported | Classic, Structured Parameters | 32 | | v0.6.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 33 | | v0.7.0 | Kubernetes v1.32+ | supported | Structured Parameters | 34 | 35 | ## Documentation 36 | 37 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 38 | - [How to deploy and use Intel GPU resource driver](USAGE.md) 39 | - Optional: [How to build Intel GPU resource driver container image](BUILD.md) -------------------------------------------------------------------------------- /doc/gpu/allocation-delayed.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | title "Delayed allocation" 3 | 4 | actor Actor 5 | participant ResourceClaim 6 | participant Pod 7 | participant Controller 8 | participant Plugin 9 | 10 | Actor -> ResourceClaim : deploy 11 | ResourceClaim -> Controller : notify 12 | note right of Controller 13 | the difference is here 14 | end note 15 | Controller -> Controller : wait for first user 16 | Actor -> Pod : deploy 17 | Pod -> Controller : find 
suitable nodes 18 | Pod -> Controller : Allocate on Node N 19 | Plugin -> ResourceClaim : prepare resource and mark Ready 20 | 21 | @enduml 22 | 23 | -------------------------------------------------------------------------------- /doc/gpu/allocation-immediate.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | title "Immediate allocation" 3 | 4 | actor Actor 5 | participant ResourceClaim 6 | participant Pod 7 | participant Controller 8 | participant Plugin 9 | 10 | Actor -> ResourceClaim : deploy 11 | ResourceClaim -> Controller : notify 12 | note right of Controller 13 | the difference is here 14 | end note 15 | Controller -> Controller : find suitable nodes 16 | Controller -> Controller : Allocate on Node N 17 | Actor -> Pod : deploy 18 | Plugin -> ResourceClaim : prepare resource and mark Ready 19 | 20 | @enduml 21 | 22 | -------------------------------------------------------------------------------- /doc/gpu/complete-overview.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | left to right direction 4 | allowmixing 5 | 6 | 7 | component "CRD resource-classes" { 8 | component "resource-class0" { 9 | component "CRD resource-class0-parameters" 10 | } 11 | component "resource-class1" { 12 | component "CRD resource-class1-parameters" 13 | } 14 | } 15 | 16 | component "CRD nodeallocationstats" as crdnas { 17 | cloud "node0" as nasnode0 { 18 | component "allocatable GPUs" as allocatable 19 | component "claim-requests" as requests 20 | component "claim-allocations" as allocations 21 | } 22 | } 23 | 24 | node "control-plane" as cp { 25 | component "Scheduler / DRA-controller" as scheduler 26 | component "R-D controller" as rdcontroller 27 | component "API" as api 28 | } 29 | 30 | node "node0" as wn { 31 | component "Pod" as pod 32 | component "R-D kubelet-plugin" as rdplugin 33 | } 34 | 35 | component "resourceclaim0\n\nresource-class0\nparametersRef:" as 
resclaim0 { 36 | component resclaimparams0 [ 37 | type: gpu, 38 | memory: 256, 39 | millicores: 100 40 | ] 41 | } 42 | 43 | package "Pod.yaml" as podyaml { 44 | } 45 | 46 | package "ResourceClaim.yaml" as resclaimyaml { 47 | } 48 | 49 | podyaml ..> api : deploy 50 | resclaimyaml ..> api : deploy 51 | 52 | cloud "Schedule Pod" as schedulepod { 53 | } 54 | 55 | api ..> schedulepod 56 | schedulepod ..> scheduler 57 | rdplugin --> allocatable : 0. populate & sync with CDI/CRD 58 | api --> resclaim0 : 1. create 59 | resclaim0 --> rdcontroller : 2. notify 60 | rdcontroller --> requests : 3. create 61 | rdcontroller --> requests : 3. create 62 | crdnas --> rdplugin : 4. allocate and update 63 | 64 | scheduler <=> rdcontroller : unsuitableNodes 65 | rdcontroller --> nasnode0 : enough resources? 66 | 67 | @enduml 68 | -------------------------------------------------------------------------------- /doc/gpu/generate-pngs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if ! 
type plantuml &> /dev/null; then 4 | echo "ERR: No plantuml found in PATH, plantuml is needed to produce PNG files" 5 | exit 1 6 | fi 7 | 8 | # source files are in script dir 9 | dir=${0%/*} 10 | 11 | for puml in "$dir"/*puml; do 12 | png="${puml%.puml}.png" 13 | # update if PNG missing or older that source file 14 | if test "$puml" -nt "$png"; then 15 | echo "$puml" 16 | plantuml "$puml" "$png" 17 | fi 18 | done 19 | -------------------------------------------------------------------------------- /doc/gpu/high-level-overview.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | allowmixing 4 | 5 | actor User 6 | 7 | component "resourceclaim0\n\nresourceClass: class0\nparametersRef:" as resclaim0 { 8 | component resclaimparams0 [ 9 | type: gpu, 10 | memory: 256, 11 | millicores: 100, 12 | count: 1, 13 | ] 14 | } 15 | 16 | component "resource-classes" { 17 | component "class0" { 18 | component "class0-parameters" 19 | } 20 | component "class1" { 21 | component "class1-parameters" 22 | } 23 | } 24 | 25 | left to right direction 26 | 27 | User --> resclaim0 : deploy 28 | 29 | @enduml 30 | 31 | -------------------------------------------------------------------------------- /doc/qat/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel® QAT Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 
14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make qat-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry QAT_IMAGE_NAME=myimage QAT_IMAGE_VERSION=myversion make qat-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, QAT_IMAGE_NAME, QAT_IMAGE_VERSION even if specified): 26 | ```bash 27 | QAT_IMAGE_TAG=myregistry/myimagename:myversion make qat-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make qat-container-push 33 | ``` 34 | or 35 | ```bash 36 | QAT_IMAGE_TAG=registry.local/intel-qat-resource-driver:latest make qat-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- /doc/qat/README.md: -------------------------------------------------------------------------------- 1 | # Intel® QAT resource driver for Kubernetes 2 | 3 | CAUTION: This is beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to v1beta1 in K8s v1.32. Latest DRA drivers support only K8s v1.32+. 12 | 13 | ## Host OS requirements 14 | 15 | In order to guarantee proper operation, ensure Linux kernel module `vfio_pci` has been loaded. 
16 | 17 | The QAT Kubernetes resource driver is intended to be used on upstream Linux kernels, 18 | see [the in-tree kernel documentation](https://intel.github.io/quickassist/RN/In-Tree/in_tree_firmware_RN.html) 19 | for details. Note though, that the QAT resource driver itself does not depend on 20 | any QAT user space libraries mentioned in that document. 21 | 22 | ## Supported QAT devices 23 | 24 | All 4th Gen Intel® Xeon® Scalable Processor QAT devices handled by the Linux kernel 25 | driver module `qat_4xxx` are supported. 26 | 27 | ## Supported Kubernetes Versions 28 | 29 | Supported Kubernetes versions are listed below: 30 | 31 | | Branch | Kubernetes branch/version | Status | DRA | 32 | |:------------------|:--------------------------------|:------------|:-------------------------------| 33 | | v0.1.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 34 | | v0.2.0 | Kubernetes v1.32+ | supported | Structured Parameters | 35 | 36 | ## QAT service configuration 37 | 38 | In version 0.1.0 static configuration of QAT services is done using a ConfigMap, 39 | please have a look at 40 | [the example ConfigMap yaml](../../deployments/qat/examples/intel-qat-resource-driver-configuration.yaml). 41 | 42 | The ConfigMap and Resource Claims use the same string notation as the QAT kernel 43 | driver when specifying what services are to be configured for the device and Resource 44 | Claim. When two services are requested, the service strings are to be separated by 45 | semicolon (';'). 
Supported services are: 46 | * Symmetric cryptography: `sym` 47 | * Asymmetric cryptography: `asym` 48 | * Compression: `dc` 49 | 50 | ## Documentation 51 | 52 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 53 | - [How to deploy and use Intel® QAT resource driver](USAGE.md) 54 | - Optional: [How to build Intel® QAT resource driver container image](BUILD.md) -------------------------------------------------------------------------------- /doc/qat/TESTING.md: -------------------------------------------------------------------------------- 1 | # Test Cases 2 | 3 | ## Intel® QAT Device Plugin 4 | There are test cases made for [Intel® QAT Device Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md). 5 | It is possible to run those images using this resource driver. Those images are 6 | available in the following links. 7 | 8 | - [qatlib-sample-code](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/openssl-qat-engine) 9 | - [qat-dpdk-test](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/crypto-perf) 10 | 11 | Build the images in your environment, create a resourceClaimTemplate and run 12 | the pods with the following commands. 13 | ``` 14 | kubectl apply -f deployments/qat/tests/resource-claim-template.yaml 15 | kubectl apply -k deployments/qat/tests/qatlib-sample-code 16 | kubectl apply -k deployments/qat/tests/qat-dpdk-test 17 | ``` 18 | All cases include both crypto and compress tests. 19 | 20 | To run `qat-dpdk-test`, the cluster should have `CPU Manager Policy` as `static` 21 | in its kubelet configuration. In addition, `hugepages-2Mi` resource should be 22 | available. 23 | 24 | There is an example [cluster setup yaml](../../deployments/qat/tests/qat-dpdk-test/modified-cluster-setup.yaml) 25 | for setting cpu manager policy as static. Re-create the cluster with the 26 | configurations enabled. 
27 | -------------------------------------------------------------------------------- /doc/qat/USAGE.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | 3 | - Kubernetes 1.32+, with `DynamicResourceAllocation` feature-flag enabled, and 4 | [other cluster parameters](../../hack/clusterconfig.yaml) 5 | - Container runtime needs to support CDI: 6 | - CRI-O v1.23.0 or newer 7 | - Containerd v1.7 or newer 8 | 9 | ## Deploy resource-driver 10 | 11 | Deploy DeviceClass, Namespace and ResourceDriver 12 | ```bash 13 | kubectl apply -k deployments/qat/ 14 | ``` 15 | 16 | By default, the kubelet-plugin is deployed on _all_ nodes in the cluster, as no nodeSelector is defined. 17 | To restrict the deployment to QAT-enabled nodes, follow these steps: 18 | 19 | 1. Install Node Feature Discovery (NFD): 20 | 21 | Follow [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery) documentation to install and configure NFD in your cluster. 22 | 23 | ```bash 24 | kubectl apply -k "https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.1" 25 | ``` 26 | 27 | 2. Apply NFD Rules: 28 | 29 | ```bash 30 | kubectl apply -k deployments/qat/overlays/nfd_labeled_nodes/ 31 | ``` 32 | After NFD is installed and running, make sure the target node is labeled with: 33 | ```bash 34 | intel.feature.node.kubernetes.io/qat: "true" 35 | ``` 36 | 37 | When deploying custom-built resource driver image, change `image:` lines in 38 | [resource-driver](../../deployments/qat/base/resource-driver.yaml) to match its location. 39 | 40 | 41 | ## `deployments/` directory contains all required YAMLs: 42 | 43 | * `deployments/qat/base/device-class.yaml` - pre-defined DeviceClass that ResourceClaims can refer to. 44 | * `deployments/qat/base/namespace.yaml` - Kubernetes namespace for QAT resource driver. 
45 | * `deployments/qat/base/resource-driver.yaml` - actual resource driver with service account and RBAC policy 46 | - kubelet-plugin DaemonSet - node-agent which performs three functions: 47 | 1) discovery of supported hardware on the Kubernetes cluster node and its announcement as a ResourceSlice. 48 | 2) preparation of the hardware allocated to the ResourceClaims for the Pod that is being started on the node. 49 | 3) unpreparation of the hardware allocated to the ResourceClaims for the Pod that has stopped and reached final state on the node. 50 | 51 | ### Example use case: Pod with QAT accelerator 52 | 53 | The simplest way to use the Intel® QAT resource driver is to create a ResourceClaim 54 | and add it to the Pod spec. The Intel® QAT resource driver will take care of allocating 55 | a suitable device to the Resource Claim when Kubernetes schedules the Pod on the node. 56 | 57 | Example: 58 | ``` 59 | apiVersion: resource.k8s.io/v1beta1 60 | kind: ResourceClaimTemplate 61 | metadata: 62 | name: qat-template-sym 63 | spec: 64 | spec: 65 | devices: 66 | requests: 67 | - name: qat-request-sym 68 | deviceClassName: qat.intel.com 69 | selectors: 70 | - cel: 71 | expression: |- 72 | device.attributes["qat.intel.com"].services == "sym" || 73 | device.attributes["qat.intel.com"].services == "sym;asym" || 74 | device.attributes["qat.intel.com"].services == "sym;dc" || 75 | device.attributes["qat.intel.com"].services == "asym;sym" || 76 | device.attributes["qat.intel.com"].services == "dc;sym" 77 | 78 | --- 79 | apiVersion: apps/v1 80 | kind: Deployment 81 | metadata: 82 | name: qat-sample-sym 83 | labels: 84 | app: inline-qat-deployment 85 | spec: 86 | replicas: 1 87 | selector: 88 | matchLabels: 89 | app: inline-qat-deployment 90 | template: 91 | metadata: 92 | labels: 93 | app: inline-qat-deployment 94 | spec: 95 | containers: 96 | - name: with-resource 97 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 98 | command: ["sh", "-c", "ls -la /dev/vfio/ && sleep 
300"] 99 | securityContext: 100 | capabilities: 101 | add: 102 | ["IPC_LOCK"] 103 | resources: 104 | claims: 105 | - name: resource-sym 106 | resourceClaims: 107 | - name: resource-sym 108 | resourceClaimTemplateName: qat-template-sym 109 | ``` 110 | QAT services are matched by CEL expression; in the example above, `sym` and `asym` 111 | services are considered in the CEL expression. Examples of other common service 112 | matches include `sym;asym`, `[^a]?sym` and `dc`, see [README](README.md#qat-service-configuration). 113 | 114 | `IPC_LOCK` capability is required since VFIO based device access expects IPC_LOCK with the QAT sw stack. 115 | -------------------------------------------------------------------------------- /gaudi.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | GAUDI_VERSION ?= v0.4.1 17 | GAUDI_IMAGE_NAME ?= intel-gaudi-resource-driver 18 | GAUDI_IMAGE_VERSION ?= $(GAUDI_VERSION) 19 | GAUDI_IMAGE_TAG ?= $(REGISTRY)/$(GAUDI_IMAGE_NAME):$(GAUDI_IMAGE_VERSION) 20 | 21 | GAUDI_BINARIES = \ 22 | bin/kubelet-gaudi-plugin 23 | 24 | GAUDI_COMMON_SRC = \ 25 | $(COMMON_SRC) \ 26 | pkg/gaudi/cdihelpers/*.go \ 27 | pkg/gaudi/device/*.go \ 28 | pkg/gaudi/discovery/*.go 29 | 30 | # Gaudi DRA driver is not statically built, it depends on libhlml.so, therefore 31 | # the -extldflags ${EXT_LDFLAGS} is not used. 32 | GAUDI_LDFLAGS = ${LDFLAGS} -X ${PKG}/pkg/version.driverVersion=${GAUDI_VERSION} 33 | 34 | .PHONY: gaudi 35 | gaudi: $(GAUDI_BINARIES) 36 | 37 | bin/kubelet-gaudi-plugin: cmd/kubelet-gaudi-plugin/*.go $(GAUDI_COMMON_SRC) 38 | GOOS=linux GOARCH=${ARCH} \ 39 | go build -a -ldflags "${GAUDI_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gaudi-plugin 40 | 41 | .PHONY: gaudi-container-build 42 | gaudi-container-build: cleanall vendor 43 | @echo "Building Gaudi resource driver container..." 44 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(GAUDI_IMAGE_TAG) \ 45 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) \ 46 | --build-arg HTTP_PROXY=$(http_proxy) \ 47 | --build-arg HTTPS_PROXY=$(https_proxy) \ 48 | --build-arg NO_PROXY=$(no_proxy) \ 49 | -f Dockerfile.gaudi . 50 | 51 | .PHONY: gaudi-container-push 52 | gaudi-container-push: gaudi-container-build 53 | $(DOCKER) push $(GAUDI_IMAGE_TAG) 54 | -------------------------------------------------------------------------------- /gpu.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Use a custom version for E2E tests if we are testing in CI 16 | GPU_VERSION ?= v0.7.0 17 | GPU_IMAGE_NAME ?= intel-gpu-resource-driver 18 | GPU_IMAGE_VERSION ?= $(GPU_VERSION) 19 | GPU_IMAGE_TAG ?= $(REGISTRY)/$(GPU_IMAGE_NAME):$(GPU_IMAGE_VERSION) 20 | 21 | GPU_BINARIES = \ 22 | bin/kubelet-gpu-plugin 23 | 24 | GPU_COMMON_SRC = \ 25 | $(COMMON_SRC) \ 26 | pkg/gpu/cdihelpers/*.go \ 27 | pkg/gpu/device/*.go \ 28 | pkg/gpu/discovery/*.go 29 | 30 | GPU_LDFLAGS = ${LDFLAGS} -extldflags $(EXT_LDFLAGS) -X ${PKG}/pkg/version.driverVersion=${GPU_VERSION} 31 | 32 | .PHONY: gpu 33 | gpu: $(GPU_BINARIES) 34 | 35 | bin/kubelet-gpu-plugin: cmd/kubelet-gpu-plugin/*.go $(GPU_COMMON_SRC) 36 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 37 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gpu-plugin 38 | 39 | bin/alert-webhook: cmd/alert-webhook/*.go $(GPU_COMMON_SRC) 40 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 41 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/alert-webhook 42 | 43 | .PHONY: gpu-container-build 44 | gpu-container-build: cleanall vendor 45 | @echo "Building GPU resource drivers container..." 46 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(GPU_IMAGE_TAG) \ 47 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) -f Dockerfile.gpu . 
48 | 49 | .PHONY: gpu-container-push 50 | gpu-container-push: gpu-container-build 51 | $(DOCKER) push $(GPU_IMAGE_TAG) 52 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /hack/clusterconfig.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeadm.k8s.io/v1beta3 2 | kind: ClusterConfiguration 3 | apiServer: 4 | extraArgs: 5 | feature-gates: "DynamicResourceAllocation=true" 6 | runtime-config: "resource.k8s.io/v1beta1=true" 7 | controllerManager: 8 | extraArgs: 9 | feature-gates: "DynamicResourceAllocation=true" 10 | scheduler: 11 | extraArgs: 12 | "feature-gates": "DynamicResourceAllocation=true" 13 | --- 14 | apiVersion: kubelet.config.k8s.io/v1beta1 15 | kind: KubeletConfiguration 16 | featureGates: 17 | DynamicResourceAllocation: true 18 | --- 19 | apiVersion: kubeadm.k8s.io/v1beta3 20 | kind: InitConfiguration 21 | nodeRegistration: 22 | criSocket: "unix:///var/run/crio/crio.sock" 23 | #criSocket: "unix:///var/run/containerd/containerd.sock" 24 | --- 25 | apiVersion: kubeproxy.config.k8s.io/v1alpha1 26 | kind: 
KubeProxyConfiguration 27 | featureGates: 28 | DynamicResourceAllocation: true 29 | -------------------------------------------------------------------------------- /hack/fake_libhlml/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -O -Wall -Wextra -Wno-unused-parameter -fPIC 3 | LDFLAGS = -shared 4 | TARGET = fake_libhlml.so 5 | SRCS = $(wildcard *.c) 6 | OBJS = $(SRCS:.c=.o) 7 | 8 | all: $(TARGET) 9 | 10 | $(TARGET): $(OBJS) 11 | $(CC) $(LDFLAGS) -o $@ $^ 12 | 13 | %.o: %.c 14 | $(CC) $(CFLAGS) -c $< -o $@ 15 | 16 | clean: 17 | rm -f $(OBJS) $(TARGET) 18 | 19 | .PHONY: all clean 20 | -------------------------------------------------------------------------------- /hack/fake_libhlml/README.md: -------------------------------------------------------------------------------- 1 | This implements a stub / mock for the interface defined in 2 | https://github.com/HabanaAI/gohlml/blob/main/hlml.h. 3 | 4 | The result is a shared library fake_libhlml.so - it can be used to simulate presence of Gaudi 5 | devices and kernel driver. 
6 | 7 | To run tests for Gaudi health monitoring locally, follow these steps: 8 | 9 | - build hack/fake_libhlml 10 | ``` 11 | cd hack/fake_libhlml 12 | make 13 | ``` 14 | - deploy it where Go module expects to find it 15 | ``` 16 | sudo mkdir /usr/lib/habanalabs 17 | sudo cp hack/fake_libhlml/fake_libhlml.so /usr/lib/habanalabs/libhlml.so 18 | ``` 19 | - add ld config to use that library and trigger ldconfig, it will be needed for running tests 20 | with and without VSCode: 21 | ``` 22 | cat << EOF | sudo tee /etc/ld.so.conf.d/habanalabs.conf 23 | /usr/lib/habanalabs/ 24 | EOF 25 | 26 | sudo ldconfig 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /hack/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | // This package imports things required by build scripts, to force `go mod` to see them as dependencies 5 | package tools 6 | 7 | import _ "k8s.io/code-generator" 8 | -------------------------------------------------------------------------------- /pkg/fakehlml/fake_hlml.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// AddDevices registers every entry of devicesInfo with the fake HLML library
// by calling the C add_device() helper once per device. The PCI address, model
// (PCI device ID), serial and accel index are taken from the DeviceInfo; the
// vendor ID is always passed as the fixed string "0x0".
//
// NOTE(review): the strings created with C.CString are not released with
// C.free anywhere in this function. Whether add_device() copies them or keeps
// the pointers is not visible here — confirm ownership to rule out a leak.
func AddDevices(devicesInfo device.DevicesInfo) {
	for _, deviceInfo := range devicesInfo {
		C.add_device(
			C.CString(deviceInfo.PCIAddress),
			C.CString(deviceInfo.Model),
			C.CString("0x0"), // vendor
			C.CString(deviceInfo.Serial),
			C.uint(deviceInfo.DeviceIdx),
		)
	}
}
/pkg/fakehlml/fake_hlml.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT 2 | * 3 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 4 | * 5 | */ 6 | 7 | #ifndef __FAKE_HLML_H__ 8 | #define __FAKE_HLML_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #include "../../vendor/github.com/HabanaAI/gohlml/hlml.h" 15 | 16 | /* Enum for returned values of the different APIs */ 17 | typedef enum call_identity { 18 | FAKE_INIT = 0, 19 | FAKE_INIT_WITH_FLAGS, 20 | FAKE_SHUTDOWN, 21 | FAKE_DEVICE_GET_COUNT, 22 | FAKE_DEVICE_GET_HANDLE_BY_PCI_BUS_ID, 23 | FAKE_DEVICE_GET_HANDLE_BY_INDEX, 24 | FAKE_DEVICE_GET_HANDLE_BY_UUID, 25 | FAKE_DEVICE_GET_NAME, 26 | FAKE_DEVICE_GET_PCI_INFO, 27 | FAKE_DEVICE_GET_SERIAL, 28 | FAKE_DEVICE_REGISTER_EVENTS, 29 | FAKE_EVENT_SET_CREATE, 30 | FAKE_EVENT_SET_FREE, 31 | FAKE_EVENT_SET_WAIT, 32 | FAKE_CALL_IDENTITY_MAX 33 | } call_identity_t; 34 | 35 | void add_device(const char *pci_addr, const char *pci_device_id, const char *pci_vendor_id, const char *serial, unsigned int index); 36 | void reset(void); 37 | 38 | void set_error(call_identity_t call_id, hlml_return_t errCode); 39 | void set_success(call_identity_t call_id); 40 | 41 | void add_critical_event(const char *serial); 42 | void reset_events(void); 43 | 44 | #ifdef __cplusplus 45 | } //extern "C" 46 | #endif 47 | 48 | #endif /* __FAKE_HLML_H__ */ 49 | -------------------------------------------------------------------------------- /pkg/fakesysfs/fakesysfs.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
// newPCIAddress finds the next free PCI address in the given driver directory,
// scanning forward from currentAddress (inclusive) through devices, then buses,
// then domains.
//
// currentAddress must start with "DDDD:BB:dd" where domain, bus and device are
// hexadecimal, as in Linux DBDF notation. An address is considered free when
// os.Stat fails for "<driverDir>/<address>0" (function 0 of that device).
//
// Returns partial PCI address without function, "0000:00:00.", used in loop
// when fake VFs are generated. Returns an error when currentAddress cannot be
// parsed or when no free address is left.
func newPCIAddress(driverDir string, currentAddress string) (string, error) {
	const (
		maxDomain = 0xffff
		maxBus    = 0xff
		maxDevice = 0xff
		// Shortest prefix that holds domain, bus and device.
		minAddressLength = len("0000:00:00")
	)

	// Guard the fixed-offset slicing below against short input.
	if len(currentAddress) < minAddressLength {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	// PCI address components are hexadecimal, not decimal.
	domain, err1 := strconv.ParseUint(currentAddress[:4], 16, 64)
	bus, err2 := strconv.ParseUint(currentAddress[5:7], 16, 64)
	device, err3 := strconv.ParseUint(currentAddress[8:10], 16, 64)

	if err1 != nil || err2 != nil || err3 != nil {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	// The starting bus/device only apply to the first pass; once an inner
	// range is exhausted the lower-order counters restart from zero, otherwise
	// the search could never continue on the next bus or domain.
	for ; domain <= maxDomain; domain, bus = domain+1, 0 {
		for ; bus <= maxBus; bus, device = bus+1, 0 {
			for ; device <= maxDevice; device++ {
				// partial PCI address without function
				newAddress := fmt.Sprintf("%04x:%02x:%02x.", domain, bus, device)
				// add zero for PCI function part of the address
				newSysfsDeviceDir := path.Join(driverDir, fmt.Sprintf("%s0", newAddress))
				if _, err := os.Stat(newSysfsDeviceDir); err != nil {
					return newAddress, nil
				}
			}
		}
	}

	return "", fmt.Errorf("no addresses left")
}
// sanitizeFakeSysFsDir ensures the fake sysfs root is located in /tmp.
//
// The fake sysfs root is meant to be deletable, so to prevent disastrous
// mistakes it is only accepted when the cleaned path is /tmp itself or lies
// below it. Returns an error for any other location.
func sanitizeFakeSysFsDir(sysfsRootUntrusted string) error {
	// path.Join cleans the path, resolving "." and ".." elements.
	sysfsRoot := path.Join(sysfsRootUntrusted)

	// Match on a path-element boundary: a bare "/tmp" prefix check would also
	// accept siblings such as "/tmpfoo".
	if sysfsRoot != "/tmp" && !strings.HasPrefix(sysfsRoot, "/tmp/") {
		return fmt.Errorf("fake sysfsroot can only be in /tmp, got: %v", sysfsRoot)
	}

	return nil
}
15 | */ 16 | 17 | package device 18 | 19 | import ( 20 | "fmt" 21 | "path/filepath" 22 | "regexp" 23 | 24 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers" 25 | ) 26 | 27 | var ( 28 | PciRegexp = regexp.MustCompile(`[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$`) 29 | AccelRegexp = regexp.MustCompile(`^accel[0-9]+$`) 30 | AccelControlRegexp = regexp.MustCompile(`^accel_controlD[0-9]+$`) 31 | ModelNames = map[string]string{ 32 | "0x1000": "Gaudi", 33 | "0x1010": "Gaudi", 34 | "0x1001": "Gaudi", 35 | "0x1011": "Gaudi", 36 | "0x1020": "Gaudi2", 37 | "0x1030": "Gaudi3", 38 | "0x1060": "Gaudi3", 39 | "0x1061": "Gaudi3", 40 | "0x1062": "Gaudi3", 41 | } 42 | ) 43 | 44 | const ( 45 | DevfsAccelPath = "accel" 46 | 47 | // driver.sysfsDriverDir and driver.sysfsAccelDir are sysfsDriverPath and sysfsAccelPath 48 | // respectively prefixed with $SYSFS_ROOT. 49 | SysfsDriverPath = "bus/pci/drivers/habanalabs" 50 | SysfsAccelPath = "devices/virtual/accel/" 51 | 52 | CDIVendor = "intel.com" 53 | CDIClass = "gaudi" 54 | CDIKind = CDIVendor + "/" + CDIClass 55 | DriverName = CDIClass + "." + CDIVendor 56 | PCIAddressLength = len("0000:00:00.0") 57 | 58 | PreparedClaimsFileName = "preparedClaims.json" 59 | PluginRegistrarFileName = DriverName + ".sock" 60 | PluginSocketFileName = "plugin.sock" 61 | 62 | DefaultNamingStyle = "machine" 63 | VisibleDevicesEnvVarName = "HABANA_VISIBLE_DEVICES" 64 | ) 65 | 66 | // DeviceInfo is an internal structure type to store info about discovered device. 67 | type DeviceInfo struct { 68 | // UID is a unique identifier on node, used in ResourceSlice K8s API object as RFC1123-compliant identifier. 69 | // Consists of PCIAddress and Model with colons and dots replaced with hyphens, e.g. 0000-01-02-0-0x12345. 70 | UID string `json:"uid"` 71 | PCIAddress string `json:"pciaddress"` // PCI address in Linux DBDF notation for use with sysfs, e.g. 
0000:00:00.0 72 | Model string `json:"model"` // PCI device ID 73 | ModelName string `json:"modelname"` // SKU name of the device, e.g. Gaudi2 74 | DeviceIdx uint64 `json:"deviceidx"` // accel device number (e.g. 0 for /dev/accel/accel0) 75 | ModuleIdx uint64 `json:"moduleidx"` // OAM slot number, needed for Habana Runtime to set networking 76 | PCIRoot string `json:"pciroot"` // PCI Root complex ID 77 | Serial string `json:"serial"` // Serial number obtained through HLML library 78 | Healthy bool `json:"healthy"` // True if device is usable, false otherwise 79 | } 80 | 81 | func (g DeviceInfo) CDIName() string { 82 | return fmt.Sprintf("%s=%s", CDIKind, g.UID) 83 | } 84 | 85 | func (g *DeviceInfo) DeepCopy() *DeviceInfo { 86 | di := *g 87 | return &di 88 | } 89 | 90 | func (g *DeviceInfo) SetModelName() { 91 | if modelName, found := ModelNames[g.Model]; found { 92 | g.ModelName = modelName 93 | return 94 | } 95 | g.ModelName = "Unknown" 96 | } 97 | 98 | // DevicesInfo is a dictionary with DeviceInfo.uid being the key. 
99 | type DevicesInfo map[string]*DeviceInfo 100 | 101 | func (g *DevicesInfo) DeepCopy() DevicesInfo { 102 | devicesInfoCopy := DevicesInfo{} 103 | for duid, device := range *g { 104 | devicesInfoCopy[duid] = device.DeepCopy() 105 | } 106 | return devicesInfoCopy 107 | } 108 | func GetAccelDevfsPath() string { 109 | return filepath.Join(helpers.GetDevRoot(helpers.DevfsEnvVarName, DevfsAccelPath), DevfsAccelPath) 110 | } 111 | -------------------------------------------------------------------------------- /pkg/gaudi/device/device_test.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCDIName(t *testing.T) { 8 | tests := []struct { 9 | name string 10 | device DeviceInfo 11 | expected string 12 | }{ 13 | { 14 | name: "Valid device UID", 15 | device: DeviceInfo{ 16 | UID: "0000-01-02-0-0x12345", 17 | }, 18 | expected: "intel.com/gaudi=0000-01-02-0-0x12345", 19 | }, 20 | { 21 | name: "Another valid device UID", 22 | device: DeviceInfo{ 23 | UID: "0000-02-03-0-0x67890", 24 | }, 25 | expected: "intel.com/gaudi=0000-02-03-0-0x67890", 26 | }, 27 | } 28 | 29 | for _, tt := range tests { 30 | t.Run(tt.name, func(t *testing.T) { 31 | result := tt.device.CDIName() 32 | if result != tt.expected { 33 | t.Errorf("expected %v, got %v", tt.expected, result) 34 | } 35 | }) 36 | } 37 | } 38 | 39 | func TestDevicesInfoDeepCopy(t *testing.T) { 40 | original := DevicesInfo{ 41 | "0000-01-02-0-0x12345": { 42 | UID: "0000-01-02-0-0x12345", 43 | PCIAddress: "0000:01:02.0", 44 | Model: "0x1020", 45 | ModelName: "Gaudi2", 46 | DeviceIdx: 1, 47 | ModuleIdx: 2, 48 | PCIRoot: "0000:00", 49 | Serial: "1234567890", 50 | Healthy: true, 51 | }, 52 | } 53 | 54 | copy := original.DeepCopy() 55 | 56 | if © == &original { 57 | t.Error("DeepCopy() returned the same pointer, expected different pointers") 58 | } 59 | 60 | for key, originalDevice := range original { 61 | copyDevice, exists := 
copy[key] 62 | if !exists { 63 | t.Errorf("DeepCopy() missing device with key %v", key) 64 | continue 65 | } 66 | 67 | if copyDevice == originalDevice { 68 | t.Errorf("DeepCopy() returned the same pointer for device with key %v, expected different pointers", key) 69 | } 70 | 71 | if *copyDevice != *originalDevice { 72 | t.Errorf("DeepCopy() returned different values for device with key %v, expected identical values", key) 73 | } 74 | } 75 | } 76 | 77 | func TestSetModelName(t *testing.T) { 78 | tests := []struct { 79 | name string 80 | deviceInfo DeviceInfo 81 | expected string 82 | }{ 83 | { 84 | name: "Known model 0x1000", 85 | deviceInfo: DeviceInfo{ 86 | Model: "0x1000", 87 | }, 88 | expected: "Gaudi", 89 | }, 90 | { 91 | name: "Known model 0x1020", 92 | deviceInfo: DeviceInfo{ 93 | Model: "0x1020", 94 | }, 95 | expected: "Gaudi2", 96 | }, 97 | { 98 | name: "Unknown model", 99 | deviceInfo: DeviceInfo{ 100 | Model: "0x9999", 101 | }, 102 | expected: "Unknown", 103 | }, 104 | } 105 | 106 | for _, tt := range tests { 107 | t.Run(tt.name, func(t *testing.T) { 108 | tt.deviceInfo.SetModelName() 109 | if tt.deviceInfo.ModelName != tt.expected { 110 | t.Errorf("expected %v, got %v", tt.expected, tt.deviceInfo.ModelName) 111 | } 112 | }) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /pkg/helpers/device.go: -------------------------------------------------------------------------------- 1 | package helpers 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path" 7 | "strings" 8 | ) 9 | 10 | const ( 11 | SysfsEnvVarName = "SYSFS_ROOT" 12 | sysfsDefaultRoot = "/sys" 13 | 14 | DevfsEnvVarName = "DEVFS_ROOT" 15 | devfsDefaultRoot = "/dev" 16 | 17 | PCIAddressLength = len("0000:00:00.0") 18 | ) 19 | 20 | // GetSysfsRoot tries to get path where sysfs is mounted from 21 | // env var, or fallback to hardcoded path. 
22 | func GetSysfsRoot(sysfsPath string) string { 23 | sysfsRoot, found := os.LookupEnv(SysfsEnvVarName) 24 | 25 | if found { 26 | if _, err := os.Stat(path.Join(sysfsRoot, sysfsPath)); err == nil { 27 | fmt.Printf("using custom sysfs location: %v\n", sysfsRoot) 28 | return sysfsRoot 29 | } else { 30 | fmt.Printf("could not find sysfs at '%v' from %v env var: %v\n", sysfsPath, SysfsEnvVarName, err) 31 | } 32 | } 33 | 34 | fmt.Printf("using default sysfs location: %v\n", sysfsDefaultRoot) 35 | // If /sys is not available, devices discovery will fail gracefully. 36 | return sysfsDefaultRoot 37 | } 38 | 39 | func GetDevRoot(devfsRootEnvVarName string, devPath string) string { 40 | devfsRoot, found := os.LookupEnv(devfsRootEnvVarName) 41 | 42 | if found { 43 | if _, err := os.Stat(path.Join(devfsRoot, devPath)); err == nil { 44 | fmt.Printf("using custom devfs location: %v\n", devfsRoot) 45 | return devfsRoot 46 | } else { 47 | fmt.Printf("could not find devfs at '%v' from %v env var: %v\n", devPath, devfsRootEnvVarName, err) 48 | } 49 | } 50 | 51 | fmt.Printf("using default devfs root: %v\n", devfsDefaultRoot) 52 | return devfsDefaultRoot 53 | } 54 | 55 | func PciInfoFromDeviceUID(deviceUID string) (string, string) { 56 | // 0000-00-01-0-0x0000 -> 0000:00:01.0, 0x0000 57 | rfc1123PCIaddress := deviceUID[:PCIAddressLength] 58 | pciAddress := strings.Replace(strings.Replace(rfc1123PCIaddress, "-", ":", 2), "-", ".", 1) 59 | deviceId := deviceUID[PCIAddressLength+1:] 60 | 61 | return pciAddress, deviceId 62 | } 63 | 64 | func DeviceUIDFromPCIinfo(pciAddress string, pciid string) string { 65 | // 0000:00:01.0, 0x0000 -> 0000-00-01-0-0x0000 66 | // Replace colons and the dot in PCI address with hyphens. 
67 | rfc1123PCIaddress := strings.ReplaceAll(strings.ReplaceAll(pciAddress, ":", "-"), ".", "-") 68 | newUID := fmt.Sprintf("%v-%v", rfc1123PCIaddress, pciid) 69 | 70 | return newUID 71 | } 72 | -------------------------------------------------------------------------------- /pkg/helpers/device_test.go: -------------------------------------------------------------------------------- 1 | package helpers 2 | 3 | import ( 4 | "os" 5 | "path" 6 | "testing" 7 | ) 8 | 9 | func TestGetSysfsRoot(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | envVarValue string 13 | sysfsPath string 14 | expected string 15 | setupEnv bool 16 | }{ 17 | { 18 | name: "Custom sysfs location exists", 19 | envVarValue: TestSysfsRoot, 20 | sysfsPath: "devices", 21 | expected: TestSysfsRoot, 22 | setupEnv: true, 23 | }, 24 | { 25 | name: "Custom sysfs location does not exist", 26 | envVarValue: "/invalid/sys", 27 | sysfsPath: "devices", 28 | expected: sysfsDefaultRoot, 29 | setupEnv: true, 30 | }, 31 | { 32 | name: "Default sysfs location", 33 | envVarValue: "", 34 | sysfsPath: "devices", 35 | expected: sysfsDefaultRoot, 36 | setupEnv: false, 37 | }, 38 | } 39 | 40 | for _, tt := range tests { 41 | t.Run(tt.name, func(t *testing.T) { 42 | if tt.setupEnv { 43 | os.Setenv(SysfsEnvVarName, tt.envVarValue) 44 | defer os.Unsetenv(SysfsEnvVarName) 45 | } 46 | 47 | if tt.envVarValue != "" { 48 | if err := os.MkdirAll(path.Join(tt.envVarValue, tt.sysfsPath), os.ModePerm); err != nil { 49 | t.Logf("failed to create directory: %v", err) 50 | } 51 | defer os.RemoveAll(tt.envVarValue) 52 | } 53 | 54 | result := GetSysfsRoot(tt.sysfsPath) 55 | if result != tt.expected { 56 | t.Errorf("expected %v, got %v", tt.expected, result) 57 | } 58 | }) 59 | } 60 | } 61 | 62 | func TestGetDevRoot(t *testing.T) { 63 | tests := []struct { 64 | name string 65 | envVarName string 66 | envVarValue string 67 | devPath string 68 | expected string 69 | setupEnv bool 70 | }{ 71 | { 72 | name: "Custom devfs location 
exists", 73 | envVarName: DevfsEnvVarName, 74 | envVarValue: TestDevfsRoot, 75 | devPath: "devices", 76 | expected: TestDevfsRoot, 77 | setupEnv: true, 78 | }, 79 | { 80 | name: "Custom devfs location does not exist", 81 | envVarName: DevfsEnvVarName, 82 | envVarValue: "/invalid/dev", 83 | devPath: "devices", 84 | expected: devfsDefaultRoot, 85 | setupEnv: true, 86 | }, 87 | { 88 | name: "Default devfs location", 89 | envVarName: DevfsEnvVarName, 90 | envVarValue: "", 91 | devPath: "devices", 92 | expected: devfsDefaultRoot, 93 | setupEnv: false, 94 | }, 95 | } 96 | 97 | for _, tt := range tests { 98 | t.Run(tt.name, func(t *testing.T) { 99 | if tt.setupEnv { 100 | os.Setenv(tt.envVarName, tt.envVarValue) 101 | defer os.Unsetenv(tt.envVarName) 102 | } 103 | 104 | if tt.envVarValue != "" { 105 | if err := os.MkdirAll(path.Join(tt.envVarValue, tt.devPath), os.ModePerm); err != nil { 106 | t.Logf("failed to create directory: %v", err) 107 | } 108 | defer os.RemoveAll(tt.envVarValue) 109 | } 110 | 111 | result := GetDevRoot(tt.envVarName, tt.devPath) 112 | if result != tt.expected { 113 | t.Errorf("expected %v, got %v", tt.expected, result) 114 | } 115 | }) 116 | } 117 | } 118 | 119 | func TestPciInfoFromDeviceUID(t *testing.T) { 120 | tests := []struct { 121 | name string 122 | deviceUID string 123 | expectedPCIAddress string 124 | expectedPCIID string 125 | }{ 126 | { 127 | name: "Valid device UID", 128 | deviceUID: "1234-56-78-9-0x1234", 129 | expectedPCIAddress: "1234:56:78.9", 130 | expectedPCIID: "0x1234", 131 | }, 132 | } 133 | 134 | for _, tt := range tests { 135 | t.Run(tt.name, func(t *testing.T) { 136 | pciAddress, pciID := PciInfoFromDeviceUID(tt.deviceUID) 137 | if pciAddress != tt.expectedPCIAddress || pciID != tt.expectedPCIID { 138 | t.Errorf("expected PCI address %v and PCI ID %v, got PCI address %v and PCI ID %v", tt.expectedPCIAddress, tt.expectedPCIID, pciAddress, pciID) 139 | } 140 | }) 141 | } 142 | } 143 | 144 | func TestDeviceUIDFromPCIinfo(t 
*testing.T) { 145 | tests := []struct { 146 | name string 147 | pciAddress string 148 | pciid string 149 | expected string 150 | }{ 151 | { 152 | name: "Valid PCI address and ID", 153 | pciAddress: "0000:00:01.0", 154 | pciid: "0x0000", 155 | expected: "0000-00-01-0-0x0000", 156 | }, 157 | } 158 | 159 | for _, tt := range tests { 160 | t.Run(tt.name, func(t *testing.T) { 161 | result := DeviceUIDFromPCIinfo(tt.pciAddress, tt.pciid) 162 | if result != tt.expected { 163 | t.Errorf("expected %v, got %v", tt.expected, result) 164 | } 165 | }) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /pkg/helpers/driver.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package helpers 18 | 19 | import "context" 20 | 21 | type Driver interface { 22 | Shutdown(ctx context.Context) error 23 | } 24 | -------------------------------------------------------------------------------- /pkg/helpers/helpers_test.go: -------------------------------------------------------------------------------- 1 | package helpers 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/urfave/cli/v2" 7 | 8 | "context" 9 | "flag" 10 | "os" 11 | "testing" 12 | ) 13 | 14 | func TestNewAppWithFlags(t *testing.T) { 15 | driverName := "test-driver" 16 | newDriver := func(ctx context.Context, config *Config) (Driver, error) { 17 | return nil, nil 18 | } 19 | 20 | app := NewApp(driverName, newDriver, []cli.Flag{}, (interface{})(nil)) 21 | set := flag.NewFlagSet("test", 0) 22 | set.String("node-name", "test-node", "doc") 23 | set.String("cdi-root", "/test/cdi", "doc") 24 | set.Int("num-devices", 10, "doc") 25 | 26 | ctx := cli.NewContext(app, set, nil) 27 | 28 | err := app.Before(ctx) 29 | if err != nil { 30 | t.Fatalf("Before function failed: %v", err) 31 | } 32 | 33 | if ctx.String("node-name") != "test-node" { 34 | t.Errorf("Expected node-name to be 'test-node', got %v", ctx.String("node-name")) 35 | } 36 | 37 | if ctx.String("cdi-root") != "/test/cdi" { 38 | t.Errorf("Expected cdi-root to be '/test/cdi', got %v", ctx.String("cdi-root")) 39 | } 40 | 41 | if ctx.Int("num-devices") != 10 { 42 | t.Errorf("Expected num-devices to be 10, got %v", ctx.Int("num-devices")) 43 | } 44 | } 45 | 46 | func TestWriteFile(t *testing.T) { 47 | tests := []struct { 48 | name string 49 | filePath string 50 | fileContents string 51 | expectError bool 52 | }{ 53 | { 54 | name: "Valid file path and contents", 55 | filePath: "testfile.txt", 56 | fileContents: "Hello, World!", 57 | expectError: false, 58 | }, 59 | { 60 | name: "Invalid file path", 61 | filePath: "/invalidpath/testfile.txt", 62 | fileContents: "Hello, World!", 63 | expectError: true, 64 | }, 65 | } 66 | 67 | 
for _, tt := range tests { 68 | t.Run(tt.name, func(t *testing.T) { 69 | err := WriteFile(tt.filePath, tt.fileContents) 70 | if (err != nil) != tt.expectError { 71 | t.Errorf("WriteFile() error = %v, expectError %v", err, tt.expectError) 72 | } 73 | 74 | if !tt.expectError { 75 | content, err := os.ReadFile(tt.filePath) 76 | if err != nil { 77 | t.Fatalf("Failed to read file: %v", err) 78 | } 79 | if string(content) != tt.fileContents { 80 | t.Errorf("Expected file contents to be %v, got %v", tt.fileContents, string(content)) 81 | } 82 | os.Remove(tt.filePath) 83 | } 84 | }) 85 | } 86 | } 87 | 88 | func TestStartPlugin(t *testing.T) { 89 | tests := []struct { 90 | name string 91 | config *Config 92 | newDriver func(ctx context.Context, config *Config) (Driver, error) 93 | setup func() 94 | expectError bool 95 | }{ 96 | { 97 | name: "CDI root is not a directory", 98 | config: &Config{ 99 | CommonFlags: &Flags{ 100 | KubeletPluginDir: "/tmp/testplugin", 101 | CdiRoot: "/tmp/testfile", 102 | }, 103 | }, 104 | setup: func() { 105 | if err := os.WriteFile("/tmp/testfile", []byte("not a directory"), 0644); err != nil { 106 | t.Fatalf("Failed to write file: %v", err) 107 | } 108 | }, 109 | expectError: true, 110 | }, 111 | { 112 | name: "KubeletPluginDir does not exist", 113 | config: &Config{ 114 | CommonFlags: &Flags{ 115 | KubeletPluginDir: "/does-not-exist", 116 | }, 117 | }, 118 | expectError: true, 119 | }, 120 | { 121 | name: "CDIRoot does not exist", 122 | config: &Config{ 123 | CommonFlags: &Flags{ 124 | KubeletPluginDir: AddRandomString("/tmp/test"), 125 | CdiRoot: "/does-not-exist", 126 | }, 127 | }, 128 | expectError: true, 129 | }, 130 | { 131 | name: "NewDriver returns error", 132 | config: &Config{ 133 | CommonFlags: &Flags{ 134 | KubeletPluginDir: "/tmp/testplugin", 135 | CdiRoot: "/tmp/testcdi", 136 | }, 137 | }, 138 | newDriver: func(ctx context.Context, config *Config) (Driver, error) { 139 | return nil, fmt.Errorf("fake error %v", "from newDriver") 140 
| }, 141 | expectError: true, 142 | }, 143 | } 144 | 145 | for _, tt := range tests { 146 | t.Run(tt.name, func(t *testing.T) { 147 | if tt.setup != nil { 148 | tt.setup() 149 | } 150 | defer os.RemoveAll("/tmp/testplugin") 151 | defer os.RemoveAll("/tmp/testcdi") 152 | defer os.Remove("/tmp/testfile") 153 | 154 | ctx := context.Background() 155 | err := StartPlugin(ctx, tt.config, tt.newDriver) 156 | if (err != nil) != tt.expectError { 157 | t.Errorf("StartPlugin() error = %v, expectError %v", err, tt.expectError) 158 | } 159 | }) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /pkg/helpers/node_state.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package helpers 18 | 19 | import ( 20 | "context" 21 | "encoding/json" 22 | "fmt" 23 | "os" 24 | "sync" 25 | 26 | "k8s.io/klog/v2" 27 | drav1 "k8s.io/kubelet/pkg/apis/dra/v1beta1" 28 | cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" 29 | ) 30 | 31 | type ClaimPreparations map[string][]*drav1.Device 32 | 33 | type NodeState struct { 34 | sync.Mutex 35 | CdiCache *cdiapi.Cache 36 | Allocatable interface{} 37 | Prepared ClaimPreparations 38 | PreparedClaimsFilePath string 39 | NodeName string 40 | SysfsRoot string 41 | } 42 | 43 | func (s *NodeState) Unprepare(ctx context.Context, claimUID string) error { 44 | s.Lock() 45 | defer s.Unlock() 46 | 47 | if s.Prepared[claimUID] == nil { 48 | return nil 49 | } 50 | 51 | klog.V(5).Infof("Freeing devices from claim %v", claimUID) 52 | delete(s.Prepared, claimUID) 53 | 54 | // write prepared claims to file 55 | if err := WritePreparedClaimsToFile(s.PreparedClaimsFilePath, s.Prepared); err != nil { 56 | return fmt.Errorf("failed to write prepared claims to file: %v", err) 57 | } 58 | 59 | return nil 60 | } 61 | 62 | // GetOrCreatePreparedClaims reads a PreparedClaim from a file and deserializes it or creates the file. 63 | func GetOrCreatePreparedClaims(preparedClaimFilePath string) (ClaimPreparations, error) { 64 | if _, err := os.Stat(preparedClaimFilePath); os.IsNotExist(err) { 65 | klog.V(5).Infof("could not find file %v. Creating file", preparedClaimFilePath) 66 | f, err := os.OpenFile(preparedClaimFilePath, os.O_CREATE|os.O_WRONLY, 0600) 67 | if err != nil { 68 | return nil, fmt.Errorf("failed creating file %v. Err: %v", preparedClaimFilePath, err) 69 | } 70 | defer f.Close() 71 | 72 | if _, err := f.WriteString("{}"); err != nil { 73 | return nil, fmt.Errorf("failed writing to file %v. 
Err: %v", preparedClaimFilePath, err) 74 | } 75 | 76 | klog.V(5).Infof("empty prepared claims file created %v", preparedClaimFilePath) 77 | 78 | return make(ClaimPreparations), nil 79 | } 80 | 81 | return ReadPreparedClaimsFromFile(preparedClaimFilePath) 82 | } 83 | 84 | // ReadPreparedClaimsFromFile returns the unmarshaled content of the given prepared claims JSON file. 85 | func ReadPreparedClaimsFromFile(preparedClaimFilePath string) (ClaimPreparations, error) { 86 | 87 | preparedClaims := make(ClaimPreparations) 88 | 89 | preparedClaimsBytes, err := os.ReadFile(preparedClaimFilePath) 90 | if err != nil { 91 | klog.V(5).Infof("could not read prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err) 92 | return nil, fmt.Errorf("failed reading file %v. Err: %v", preparedClaimFilePath, err) 93 | } 94 | 95 | if err := json.Unmarshal(preparedClaimsBytes, &preparedClaims); err != nil { 96 | klog.V(5).Infof("Could not parse default prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err) 97 | return nil, fmt.Errorf("failed parsing file %v. Err: %v", preparedClaimFilePath, err) 98 | } 99 | 100 | return preparedClaims, nil 101 | } 102 | 103 | // WritePreparedClaimsToFile serializes PreparedClaims and writes it to a file. 104 | func WritePreparedClaimsToFile(preparedClaimFilePath string, preparedClaims ClaimPreparations) error { 105 | if preparedClaims == nil { 106 | preparedClaims = ClaimPreparations{} 107 | } 108 | encodedPreparedClaims, err := json.MarshalIndent(preparedClaims, "", " ") 109 | if err != nil { 110 | return fmt.Errorf("prepared claims JSON encoding failed. 
Err: %v", err) 111 | } 112 | return os.WriteFile(preparedClaimFilePath, encodedPreparedClaims, 0600) 113 | } 114 | -------------------------------------------------------------------------------- /pkg/plugintesthelpers/plugintesthelpers.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package plugintesthelpers 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "path" 23 | "testing" 24 | 25 | resourcev1 "k8s.io/api/resource/v1beta1" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | "k8s.io/apimachinery/pkg/types" 28 | ) 29 | 30 | const ( 31 | testRootPrefix = "test-*" 32 | ) 33 | 34 | type TestDirsType struct { 35 | TestRoot string 36 | CdiRoot string 37 | KubeletPluginDir string 38 | KubeletPluginRegistryDir string 39 | SysfsRoot string 40 | DevfsRoot string 41 | } 42 | 43 | // NewTestDirs creates fake CDI root, sysfs, driverPlugin dirs and returns 44 | // them as a testDirsType or an error. 
45 | func NewTestDirs(driverName string) (TestDirsType, error) { 46 | testRoot, err := os.MkdirTemp("", testRootPrefix) 47 | if err != nil { 48 | return TestDirsType{}, fmt.Errorf("failed creating test root dir: %v", err) 49 | } 50 | 51 | if err := os.Chmod(testRoot, 0755); err != nil { 52 | return TestDirsType{}, fmt.Errorf("failed changing permissions to test root dir: %v", err) 53 | } 54 | return NewTestDirsAt(testRoot, driverName) 55 | } 56 | func NewTestDirsAt(testRoot string, driverName string) (TestDirsType, error) { 57 | cdiRoot := path.Join(testRoot, "cdi") 58 | if err := os.MkdirAll(cdiRoot, 0755); err != nil { 59 | return TestDirsType{}, fmt.Errorf("failed creating fake CDI root dir: %v", err) 60 | } 61 | 62 | fakeSysfsRoot := path.Join(testRoot, "sysfs") 63 | if err := os.MkdirAll(fakeSysfsRoot, 0755); err != nil { 64 | return TestDirsType{}, fmt.Errorf("failed creating fake sysfs root dir: %v", err) 65 | } 66 | 67 | driverPluginRoot := path.Join(testRoot, "kubelet-plugin/plugins/", driverName) 68 | if err := os.MkdirAll(driverPluginRoot, 0755); err != nil { 69 | return TestDirsType{}, fmt.Errorf("failed creating fake driver plugin dir: %v", err) 70 | } 71 | 72 | driverRegistrarRoot := path.Join(testRoot, "kubelet-plugin/plugins_registry") 73 | if err := os.MkdirAll(driverRegistrarRoot, 0755); err != nil { 74 | return TestDirsType{}, fmt.Errorf("failed creating fake driver plugin dir: %v", err) 75 | } 76 | 77 | devfsRoot := path.Join(testRoot, "dev") 78 | if err := os.MkdirAll(devfsRoot, 0755); err != nil { 79 | return TestDirsType{}, fmt.Errorf("failed creating fake devfs dir: %v", err) 80 | } 81 | 82 | return TestDirsType{ 83 | TestRoot: testRoot, 84 | CdiRoot: cdiRoot, 85 | SysfsRoot: fakeSysfsRoot, 86 | KubeletPluginDir: driverPluginRoot, 87 | KubeletPluginRegistryDir: driverRegistrarRoot, 88 | DevfsRoot: devfsRoot, 89 | }, nil 90 | } 91 | 92 | func CleanupTest(t *testing.T, testname string, testRoot string) { 93 | if err := os.RemoveAll(testRoot); 
err != nil { 94 | t.Logf("%v: could not cleanup temp directory %v: %v", testname, testRoot, err) 95 | } 96 | } 97 | 98 | func NewMonitoringClaim(claimNs, claimName, claimUID, requestName, driverName, pool string, allocatedDevices []string) *resourcev1.ResourceClaim { 99 | claim := NewClaim(claimNs, claimName, claimUID, requestName, driverName, pool, allocatedDevices) 100 | claim.Spec.Devices.Requests[0].AdminAccess = &[]bool{true}[0] 101 | claim.Spec.Devices.Requests[0].AllocationMode = "All" 102 | 103 | return claim 104 | } 105 | 106 | func NewClaim(claimNs, claimName, claimUID, requestName, driverName, pool string, allocatedDevices []string) *resourcev1.ResourceClaim { 107 | allocationResults := []resourcev1.DeviceRequestAllocationResult{} 108 | for _, deviceUID := range allocatedDevices { 109 | newDevice := resourcev1.DeviceRequestAllocationResult{ 110 | Device: deviceUID, 111 | Request: requestName, 112 | Driver: driverName, 113 | Pool: pool, 114 | } 115 | allocationResults = append(allocationResults, newDevice) 116 | } 117 | 118 | alienDevice := resourcev1.DeviceRequestAllocationResult{ 119 | Device: "numberOne", 120 | Request: "complimentaryRequest", 121 | Driver: "NonExistent", 122 | Pool: pool, 123 | } 124 | allocationResults = append(allocationResults, alienDevice) 125 | 126 | claim := &resourcev1.ResourceClaim{ 127 | TypeMeta: metav1.TypeMeta{APIVersion: "resource.k8s.io/v1beta1", Kind: "ResourceClaim"}, 128 | ObjectMeta: metav1.ObjectMeta{Namespace: claimNs, Name: claimName, UID: types.UID(claimUID)}, 129 | Spec: resourcev1.ResourceClaimSpec{ 130 | Devices: resourcev1.DeviceClaim{ 131 | Requests: []resourcev1.DeviceRequest{ 132 | {Name: requestName, DeviceClassName: driverName, Count: 1}, 133 | {Name: "complimentaryRequest", DeviceClassName: "NonExistent"}, 134 | }, 135 | }, 136 | }, 137 | Status: resourcev1.ResourceClaimStatus{ 138 | Allocation: &resourcev1.AllocationResult{ 139 | Devices: resourcev1.DeviceAllocationResult{ 140 | Results: 
allocationResults, 141 | }, 142 | }, 143 | }, 144 | } 145 | 146 | return claim 147 | } 148 | -------------------------------------------------------------------------------- /pkg/qat/cdi/cdi.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cdi 18 | 19 | import ( 20 | "fmt" 21 | "path" 22 | 23 | "k8s.io/klog/v2" 24 | cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" 25 | cdispecs "tags.cncf.io/container-device-interface/specs-go" 26 | 27 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 28 | ) 29 | 30 | const ( 31 | CDIRoot = cdiapi.DefaultDynamicDir 32 | CDIVendor = "intel.com" 33 | CDIClass = "qat" 34 | CDIKind = CDIVendor + "/" + CDIClass 35 | ) 36 | 37 | type CDI struct { 38 | cache *cdiapi.Cache 39 | } 40 | 41 | func New(cdidir string) (*CDI, error) { 42 | 43 | if err := cdiapi.Configure(cdiapi.WithSpecDirs(cdidir)); err != nil { 44 | return nil, fmt.Errorf("unable to refresh the CDI registry: %v", err) 45 | } 46 | 47 | cdiCache := cdiapi.GetDefaultCache() 48 | 49 | cdi := &CDI{ 50 | cache: cdiCache, 51 | } 52 | 53 | return cdi, nil 54 | } 55 | 56 | func (c *CDI) getQatSpecs() []*cdiapi.Spec { 57 | qatSpecs := []*cdiapi.Spec{} 58 | for _, cdiSpec := range c.cache.GetVendorSpecs(CDIVendor) { 59 | if cdiSpec.Kind == 
CDIKind { 60 | qatSpecs = append(qatSpecs, cdiSpec) 61 | } 62 | } 63 | return qatSpecs 64 | } 65 | 66 | func (c *CDI) SyncDevices(vfdevices device.VFDevices) error { 67 | klog.V(5).Info("Syncing CDI devices") 68 | 69 | vfspec := &cdispecs.Spec{ 70 | Kind: CDIKind, 71 | } 72 | vfspecname := cdiapi.GenerateSpecName(CDIVendor, CDIClass) 73 | 74 | for _, vendorspec := range c.getQatSpecs() { 75 | vendorspecname := path.Base(vendorspec.GetPath()) 76 | 77 | if vendorspec.Kind != CDIKind { 78 | klog.V(5).Infof("Spec file %s is for other kind %s, skipping...", vendorspecname, vendorspec.Kind) 79 | continue 80 | } 81 | 82 | name := vfspecname + path.Ext(vendorspecname) 83 | if name == vendorspecname { 84 | klog.V(5).Infof("Adding rest of the devices to '%s'", name) 85 | vfspec = vendorspec.Spec 86 | } 87 | 88 | vendorspecupdate := false 89 | vendorspecdevices := []cdispecs.Device{} 90 | 91 | for _, vendordevice := range vendorspec.Devices { 92 | if _, exists := vfdevices[vendordevice.Name]; exists { 93 | klog.V(5).Infof("Vendor spec %s contains device name %s", vendorspecname, vendordevice.Name) 94 | 95 | delete(vfdevices, vendordevice.Name) 96 | vendorspecdevices = append(vendorspecdevices, vendordevice) 97 | } else { 98 | klog.Warningf("CDI device '%s' in spec file '%s' does not exist", vendordevice.Name, vendorspecname) 99 | vendorspecupdate = true 100 | } 101 | } 102 | if vendorspecupdate { 103 | // Update spec file that has a nonexistent device. 
104 | klog.Infof("Updating spec file %s with existing devices", path.Base(vendorspec.GetPath())) 105 | 106 | vendorspec.Devices = vendorspecdevices 107 | err := c.cache.WriteSpec(vendorspec.Spec, vendorspecname) 108 | if err != nil { 109 | klog.Warningf("Failed to update existing CDI spec file %s: %v", vendorspecname, err) 110 | } 111 | } 112 | } 113 | 114 | if len(vfdevices) > 0 { 115 | return c.appendDevices(vfspec, vfdevices, vfspecname) 116 | } 117 | 118 | return nil 119 | } 120 | 121 | func (c *CDI) adddevicespec(spec *cdispecs.Spec, vfdevices device.VFDevices) error { 122 | 123 | for _, vf := range vfdevices { 124 | cdidevice := cdispecs.Device{ 125 | Name: vf.UID(), 126 | ContainerEdits: cdispecs.ContainerEdits{ 127 | DeviceNodes: []*cdispecs.DeviceNode{ 128 | {Path: vf.DeviceNode(), Type: "c"}, 129 | }, 130 | }, 131 | } 132 | spec.Devices = append(spec.Devices, cdidevice) 133 | 134 | klog.V(5).Infof("Added device %s name %s", cdidevice.ContainerEdits.DeviceNodes[0].Path, cdidevice.Name) 135 | } 136 | return nil 137 | } 138 | 139 | func (c *CDI) appendDevices(spec *cdispecs.Spec, vfdevices device.VFDevices, name string) error { 140 | 141 | klog.V(5).Info("Append CDI devices") 142 | 143 | if err := c.adddevicespec(spec, vfdevices); err != nil { 144 | return err 145 | } 146 | 147 | version, err := cdiapi.MinimumRequiredVersion(spec) 148 | if err != nil { 149 | return fmt.Errorf("minimum CDI spec version not found: %v", err) 150 | } 151 | spec.Version = version 152 | 153 | err = c.cache.WriteSpec(spec, name) 154 | if err != nil { 155 | return fmt.Errorf("failed to write CDI spec %s: %v", name, err) 156 | } 157 | 158 | klog.Infof("CDI %s: Kind %s, Version %v", name, spec.Kind, spec.Version) 159 | return nil 160 | } 161 | 162 | func (c *CDI) OverwriteDevices(vfdevices device.VFDevices) error { 163 | var err error 164 | 165 | klog.V(5).Info("Add/overwrite CDI devices") 166 | 167 | spec := &cdispecs.Spec{ 168 | Kind: CDIKind, 169 | } 170 | 171 | name, err := 
cdiapi.GenerateNameForSpec(spec) 172 | if err != nil { 173 | return fmt.Errorf("spec name not created: %v", err) 174 | } 175 | 176 | return c.appendDevices(spec, vfdevices, name) 177 | } 178 | -------------------------------------------------------------------------------- /pkg/qat/device/state.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package device 6 | 7 | import ( 8 | "encoding/json" 9 | "fmt" 10 | "os" 11 | 12 | "k8s.io/klog/v2" 13 | ) 14 | 15 | // Map allocation id to VF device. 16 | type savedAllocations map[string][]string 17 | 18 | func (q *QATDevices) ReadStateOrCreateEmpty(statefile string) error { 19 | if statefile == "" { 20 | return nil 21 | } 22 | 23 | if _, err := os.Stat(statefile); os.IsNotExist(err) { 24 | f, err := os.OpenFile(statefile, os.O_CREATE|os.O_WRONLY, 0600) 25 | if err != nil { 26 | return fmt.Errorf("failed to create state file '%s': %v", statefile, err) 27 | } 28 | defer f.Close() 29 | 30 | if _, err := f.WriteString("{}"); err != nil { 31 | return fmt.Errorf("failed to write to state file '%s': %v", statefile, err) 32 | } 33 | 34 | return nil 35 | } 36 | 37 | return q.readState(statefile) 38 | } 39 | 40 | func (q *QATDevices) readState(statefile string) error { 41 | if statefile == "" { 42 | return nil 43 | } 44 | 45 | savedstatebytes, err := os.ReadFile(statefile) 46 | if err != nil { 47 | return fmt.Errorf("could not read state file '%s': %v", statefile, err) 48 | } 49 | 50 | saveddevices := make(savedAllocations, 0) 51 | if err := json.Unmarshal(savedstatebytes, &saveddevices); err != nil { 52 | return fmt.Errorf("failed parsing state file '%s': %v", statefile, err) 53 | } 54 | 55 | for allocatedby, vfdevices := range saveddevices { 56 | for _, vf := range vfdevices { 57 | _, _, err := q.Allocate(vf, Unset, allocatedby) 58 | 59 | if err != nil { 60 | klog.Errorf("Failed to restore VF 
device '%s' for '%s': %v", vf, allocatedby, err) 61 | continue 62 | } 63 | 64 | klog.V(5).Infof("Successfully restored VF device '%s' for '%s'", vf, allocatedby) 65 | } 66 | } 67 | 68 | return nil 69 | } 70 | 71 | func (q *QATDevices) SaveState(statefile string) error { 72 | if statefile == "" { 73 | return nil 74 | } 75 | 76 | saveddevices := make(savedAllocations, 0) 77 | 78 | for _, pf := range *q { 79 | for allocatedby, vfdevices := range pf.AllocatedDevices { 80 | vflist, exists := saveddevices[allocatedby] 81 | if !exists { 82 | vflist = make([]string, 0) 83 | } 84 | 85 | for deviceuid := range vfdevices { 86 | vflist = append(vflist, deviceuid) 87 | } 88 | saveddevices[allocatedby] = vflist 89 | } 90 | } 91 | 92 | encodedstate, err := json.MarshalIndent(saveddevices, "", " ") 93 | if err != nil { 94 | return fmt.Errorf("failed save state JSON encoding to file '%s': %v", statefile, err) 95 | } 96 | 97 | return os.WriteFile(statefile, encodedstate, 0600) 98 | } 99 | -------------------------------------------------------------------------------- /pkg/version/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// PrintDriverVersion logs the driver's build information with klog.Infof.
//
// apiGroupName is reported as the driver name. It is followed by the
// package-level driverVersion, gitCommit and buildDate values, and by the Go
// runtime version, compiler and GOOS/GOARCH of the running binary. Nothing is
// returned.
func PrintDriverVersion(apiGroupName string) {
	klog.Infof(`
Driver Name: %v,
Driver Version: %v,
Git Commit: %v,
Build Date: %v,
Go Version: %v,
Compiler: %v,
Platform: %v/%v`,
		apiGroupName,
		driverVersion,
		gitCommit,
		buildDate,
		runtime.Version(),
		runtime.Compiler,
		runtime.GOOS,
		runtime.GOARCH,
	)
}
14 | 15 | 16 | QAT_VERSION ?= v0.2.0 17 | QAT_IMAGE_NAME ?= intel-qat-resource-driver 18 | QAT_IMAGE_VERSION ?= $(QAT_VERSION) 19 | QAT_IMAGE_TAG ?= $(REGISTRY)/$(QAT_IMAGE_NAME):$(QAT_IMAGE_VERSION) 20 | 21 | QAT_BINARIES = \ 22 | bin/qat-showdevice \ 23 | bin/kubelet-qat-plugin 24 | 25 | QAT_COMMON_SRC = \ 26 | $(COMMON_SRC) \ 27 | pkg/qat/device/*.go \ 28 | pkg/qat/cdi/*.go 29 | 30 | QAT_LDFLAGS = ${LDFLAGS} -extldflags $(EXT_LDFLAGS) -X ${PKG}/pkg/version.driverVersion=${QAT_VERSION} 31 | 32 | .PHONY: qat 33 | qat: $(QAT_BINARIES) 34 | 35 | bin/qat-showdevice: cmd/qat-showdevice/*.go $(QAT_COMMON_SRC) 36 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 37 | go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/qat-showdevice 38 | 39 | bin/kubelet-qat-plugin: cmd/kubelet-qat-plugin/*.go $(QAT_COMMON_SRC) 40 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 41 | go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-qat-plugin 42 | 43 | .PHONY: qat-container-build 44 | qat-container-build: cleanall vendor 45 | @echo "Building QAT resource driver container..." 46 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(QAT_IMAGE_TAG) \ 47 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) -f Dockerfile.qat . 48 | 49 | .PHONY: qat-container-push 50 | qat-container-push: qat-container-build 51 | $(DOCKER) push $(QAT_IMAGE_TAG) 52 | 53 | .PHONY: e2e-qat 54 | e2e-qat: 55 | sed -i 's|\(intel/intel-qat-resource-driver:\)[^ ]*|\1devel|' deployments/qat/base/resource-driver.yaml 56 | go test -v ./test/e2e/... 
// init registers the suite-level setup. setupFirstNode is passed as the first
// SynchronizedBeforeSuite function; the second callback, which receives the
// bytes returned by the first, deliberately does nothing.
func init() {
	ginkgo.SynchronizedBeforeSuite(setupFirstNode, func(data []byte) {})
}

// setupFirstNode prepares the cluster before any spec runs.
//
// It loads a clientset and, when the framework's CleanStart option is set,
// deletes every namespace except the ones listed below and waits for those
// deletions to finish. Any failure aborts the suite through framework.Failf.
// The returned byte slice is always empty.
func setupFirstNode(ctx context.Context) []byte {
	c, err := framework.LoadClientset()
	if err != nil {
		framework.Failf("Error loading client: %v", err)
	}

	// Delete any namespaces except those created by the system. This ensures no
	// lingering resources are left over from a previous test run.
	if framework.TestContext.CleanStart {
		deleted, err2 := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
			[]string{
				metav1.NamespaceSystem,
				metav1.NamespaceDefault,
				metav1.NamespacePublic,
				v1.NamespaceNodeLease,
				"cert-manager",
			})
		if err2 != nil {
			framework.Failf("Error deleting orphaned namespaces: %v", err2)
		}

		framework.Logf("Waiting for deletion of the following namespaces: %v", deleted)

		// Block until the namespaces reported as deleted are really gone, bounded
		// by the framework's default pod deletion timeout.
		if err2 = framework.WaitForNamespacesDeleted(ctx, c, deleted, e2epod.DefaultPodDeletionTimeout); err2 != nil {
			framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err2)
		}
	}

	return []byte{}
}

// TestDra is the go test entry point: it routes Gomega assertion failures to
// Ginkgo and runs all registered specs as one suite.
func TestDra(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "E2E DRA Drivers Suite")
}

// TestMain wires logging and command-line flags before the tests run.
//
// The order matters: flags are registered on flag.CommandLine first, then
// parsed, and only afterwards is the framework told that all flags were read.
func TestMain(m *testing.M) {
	// Send klog output to the Ginkgo writer.
	klog.SetOutput(ginkgo.GinkgoWriter)

	logs.InitLogs()
	config.CopyFlags(config.Flags, flag.CommandLine)
	framework.RegisterCommonFlags(flag.CommandLine)
	framework.RegisterClusterFlags(flag.CommandLine)
	flag.Parse()

	// Register framework flags, then handle flags.
	framework.AfterReadingAllFlags(&framework.TestContext)

	// Now run the test suite.
	os.Exit(m.Run())
}
// LocateRepoFile locates a file inside this repository.
//
// repopath is a path relative to the repository root. Candidate locations are
// probed in this order and the first one that os.Stat succeeds on is returned:
//
//  1. $PLUGINS_REPO_DIR/repopath, if PLUGINS_REPO_DIR is set and non-empty;
//  2. <working directory>/repopath;
//  3. <working directory>/../../repopath (presumably for tests started from a
//     directory two levels below the repository root — confirm with callers).
//
// A candidate counts as found only when os.Stat returns no error at all, so a
// path that cannot be inspected (for example because a parent component is not
// a directory) is skipped instead of being reported as a match.
//
// It returns an error if the working directory cannot be determined or if no
// candidate exists; in both cases the returned path is empty.
func LocateRepoFile(repopath string) (string, error) {
	exists := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	if root := os.Getenv("PLUGINS_REPO_DIR"); root != "" {
		if path := filepath.Join(root, repopath); exists(path) {
			return path, nil
		}
	}

	currentDir, err := os.Getwd()
	if err != nil {
		return "", err
	}

	candidates := []string{
		filepath.Join(currentDir, repopath),
		filepath.Join(currentDir, "..", "..", repopath),
	}
	for _, path := range candidates {
		if exists(path) {
			return path, nil
		}
	}

	return "", errors.New("no file found, try to define PLUGINS_REPO_DIR pointing to the root of the repository")
}