├── .github ├── dependabot.yaml └── workflows │ ├── build-test-lint.yml │ ├── codeql.yml │ ├── image-push-master.yml │ └── image-push-release.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cmd ├── sriov-network-metrics-exporter.go └── sriov-network-metrics-exporter_test.go ├── collectors ├── collectors.go ├── collectors_test.go ├── pod_cpu_link.go ├── pod_cpu_link_test.go ├── pod_dev_link.go ├── pod_dev_link_test.go ├── sriovdev.go ├── sriovdev_readers.go ├── sriovdev_readers_test.go └── sriovdev_test.go ├── deployment ├── daemonset.yaml └── minimum-prom-scrape-config.yaml ├── docs └── prometheus-queries ├── go.mod ├── go.sum └── pkg ├── utils ├── test │ └── target ├── utils.go └── utils_test.go └── vfstats ├── netlink.go └── netlink_test.go /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Docker images 4 | - package-ecosystem: "docker" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | commit-message: 9 | prefix: "chore:" 10 | 11 | # GitHub Actions 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | day: "monday" 17 | 18 | - package-ecosystem: "gomod" 19 | directory: "/" 20 | schedule: 21 | interval: "weekly" 22 | day: "tuesday" 23 | groups: 24 | kubernetes: 25 | patterns: [ "k8s.io/*" ] 26 | ignore: 27 | # Ignore controller-runtime, and Kubernetes major and minor updates. These should be done manually. 
28 | - dependency-name: "sigs.k8s.io/controller-runtime" 29 | update-types: [ "version-update:semver-major", "version-update:semver-minor" ] 30 | - dependency-name: "k8s.io/*" 31 | update-types: [ "version-update:semver-major", "version-update:semver-minor" ] 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/build-test-lint.yml: -------------------------------------------------------------------------------- 1 | name: build-test-lint 2 | on: [push, pull_request] 3 | 4 | env: 5 | BUILD_PLATFORMS: linux/amd64,linux/arm64,linux/ppc64le,linux/s390x 6 | 7 | jobs: 8 | build: 9 | name: build 10 | strategy: 11 | matrix: 12 | go-version: [1.22.x] 13 | goarch: [amd64,arm64,ppc64le,s390x] 14 | os: [ubuntu-latest] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - name: Set up Go matrix 18 | uses: actions/setup-go@v5 19 | with: 20 | go-version: ${{ matrix.go-version }} 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v4 24 | 25 | - name: Build 26 | env: 27 | GOARCH: ${{ matrix.goarch }} 28 | GOOS: ${{ matrix.goos }} 29 | run: make build 30 | 31 | build-image: 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: Check out the repo 35 | uses: actions/checkout@v4 36 | 37 | # Add support for more platforms with QEMU (optional) 38 | # https://github.com/docker/setup-qemu-action 39 | - name: Set up QEMU 40 | uses: docker/setup-qemu-action@v3 41 | 42 | - name: Set up Docker Buildx 43 | uses: docker/setup-buildx-action@v3 44 | 45 | - name: Build and push container image 46 | uses: docker/build-push-action@v6 47 | with: 48 | context: . 
49 | push: false 50 | platforms: ${{ env.BUILD_PLATFORMS }} 51 | file: ./Dockerfile 52 | 53 | 54 | test: 55 | runs-on: ubuntu-latest 56 | needs: build 57 | name: test 58 | steps: 59 | - name: Set up Go 60 | uses: actions/setup-go@v5 61 | with: 62 | go-version: 1.22.x 63 | 64 | - name: Check out code into the Go module directory 65 | uses: actions/checkout@v4 66 | 67 | - name: Install hwdata 68 | run: sudo apt-get install hwdata -y 69 | 70 | - name: Go test 71 | run: make test 72 | 73 | test-coverage: 74 | runs-on: ubuntu-latest 75 | needs: build 76 | name: test-coverage 77 | steps: 78 | - name: Set up Go 79 | uses: actions/setup-go@v5 80 | with: 81 | go-version: 1.22.x 82 | 83 | - uses: actions/checkout@v4 84 | 85 | - name: Install hwdata 86 | run: sudo apt-get install hwdata -y 87 | 88 | - name: Go test with coverage 89 | run: make test-coverage 90 | 91 | golangci: 92 | name: Golangci-lint 93 | runs-on: ubuntu-latest 94 | steps: 95 | - name: Set up Go 96 | uses: actions/setup-go@v5 97 | with: 98 | go-version: 1.22.x 99 | - uses: actions/checkout@v4 100 | - name: golangci-lint 101 | uses: golangci/golangci-lint-action@v3 102 | with: 103 | # Required: the golangci-lint version to run, pinned to an exact release (including the patch version) so CI results are reproducible.
104 | version: v1.63.4 105 | 106 | hadolint: 107 | runs-on: ubuntu-latest 108 | name: Hadolint 109 | steps: 110 | - uses: actions/checkout@v4 111 | - uses: brpaz/hadolint-action@v1.5.0 112 | name: Run Hadolint 113 | with: 114 | dockerfile: ./Dockerfile 115 | ignore: DL3018 # DL3018: GH issue 368 116 | 117 | go-check: 118 | runs-on: ubuntu-latest 119 | steps: 120 | - uses: actions/checkout@v4 121 | 122 | - name: Set up Go 123 | uses: actions/setup-go@v5 124 | with: 125 | go-version: 1.22.x 126 | 127 | # if this fails, run go mod tidy 128 | - name: Check if module files are consistent with code 129 | run: go mod tidy && git diff --exit-code 130 | 131 | # if this fails, run go mod vendor 132 | - name: Check if vendor directory is consistent with go modules 133 | run: go mod vendor && git diff --exit-code 134 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | schedule: 9 | - cron: "1 7 * * 5" 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ go ] 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v3 31 | with: 32 | languages: ${{ matrix.language }} 33 | queries: +security-and-quality 34 | 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v3 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:${{ matrix.language }}" 42 | -------------------------------------------------------------------------------- /.github/workflows/image-push-master.yml: 
-------------------------------------------------------------------------------- 1 | name: "Push images on merge to master" 2 | 3 | env: 4 | IMAGE_NAME: ghcr.io/${{ github.repository }} 5 | BUILD_PLATFORMS: linux/amd64,linux/arm64,linux/ppc64le,linux/s390x 6 | 7 | on: 8 | push: 9 | branches: 10 | - master 11 | 12 | jobs: 13 | build-and-push-image: 14 | runs-on: ubuntu-latest 15 | env: 16 | IMAGE_NAME: ghcr.io/${{ github.repository }} 17 | steps: 18 | - name: Check out the repo 19 | uses: actions/checkout@v4 20 | 21 | # Add support for more platforms with QEMU (optional) 22 | # https://github.com/docker/setup-qemu-action 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Login to Docker 30 | uses: docker/login-action@v3 31 | with: 32 | registry: ghcr.io 33 | username: ${{ github.repository_owner }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | - name: Docker meta 37 | id: docker_meta 38 | uses: docker/metadata-action@v5 39 | with: 40 | images: ${{ env.IMAGE_NAME }} 41 | 42 | - name: Build and push container image 43 | uses: docker/build-push-action@v6 44 | with: 45 | context: . 
46 | push: true 47 | platforms: ${{ env.BUILD_PLATFORMS }} 48 | tags: | 49 | ${{ env.IMAGE_NAME }}:latest 50 | ${{ env.IMAGE_NAME }}:${{ github.sha }} 51 | file: ./Dockerfile 52 | labels: ${{ steps.docker_meta.outputs.labels }} 53 | -------------------------------------------------------------------------------- /.github/workflows/image-push-release.yml: -------------------------------------------------------------------------------- 1 | name: "Push images on release" 2 | 3 | env: 4 | IMAGE_NAME: ghcr.io/${{ github.repository }} 5 | BUILD_PLATFORMS: linux/amd64,linux/arm64,linux/ppc64le,linux/s390x 6 | 7 | on: 8 | push: 9 | tags: 10 | - v* 11 | 12 | jobs: 13 | build-and-push-image: 14 | runs-on: ubuntu-latest 15 | env: 16 | IMAGE_NAME: ghcr.io/${{ github.repository }} 17 | steps: 18 | - name: Check out the repo 19 | uses: actions/checkout@v4 20 | 21 | # Add support for more platforms with QEMU (optional) 22 | # https://github.com/docker/setup-qemu-action 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Login to Docker 30 | uses: docker/login-action@v3 31 | with: 32 | registry: ghcr.io 33 | username: ${{ github.repository_owner }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | - name: Docker meta 37 | id: docker_meta 38 | uses: docker/metadata-action@v5 39 | with: 40 | images: ${{ env.IMAGE_NAME }} 41 | flavor: | 42 | latest=false 43 | 44 | - name: Build and push container image 45 | uses: docker/build-push-action@v6 46 | with: 47 | context: . 
48 | push: true 49 | platforms: ${{ env.BUILD_PLATFORMS }} 50 | tags: | 51 | ${{ steps.docker_meta.outputs.tags }} 52 | labels: ${{ steps.docker_meta.outputs.labels }} 53 | file: ./Dockerfile 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary Build Files 2 | /build/* 3 | bin/ 4 | testbin/ 5 | env 6 | .env 7 | .cache 8 | # Created by https://www.gitignore.io/api/go,vim,emacs,visualstudiocode 9 | ### Emacs ### 10 | # -*- mode: gitignore; -*- 11 | *~ 12 | \#*\# 13 | /.emacs.desktop 14 | /.emacs.desktop.lock 15 | *.elc 16 | auto-save-list 17 | tramp 18 | .\#* 19 | # Org-mode 20 | .org-id-locations 21 | *_archive 22 | # flymake-mode 23 | *_flymake.* 24 | # eshell files 25 | /eshell/history 26 | /eshell/lastdir 27 | # elpa packages 28 | /elpa/ 29 | # reftex files 30 | *.rel 31 | # AUCTeX auto folder 32 | /auto/ 33 | # cask packages 34 | .cask/ 35 | dist/ 36 | # Flycheck 37 | flycheck_*.el 38 | # server auth directory 39 | /server/ 40 | # projectiles files 41 | .projectile 42 | projectile-bookmarks.eld 43 | # directory configuration 44 | .dir-locals.el 45 | # saveplace 46 | places 47 | # url cache 48 | url/cache/ 49 | # cedet 50 | ede-projects.el 51 | # smex 52 | smex-items 53 | # company-statistics 54 | company-statistics-cache.el 55 | # anaconda-mode 56 | anaconda-mode/ 57 | ### Go ### 58 | # Binaries for programs and plugins 59 | *.exe 60 | *.exe~ 61 | *.dll 62 | *.so 63 | *.dylib 64 | # Test binary, build with 'go test -c' 65 | *.test 66 | # Output of the go coverage tool, specifically when used with LiteIDE 67 | *.out 68 | ### Vim ### 69 | # swap 70 | .sw[a-p] 71 | .*.sw[a-p] 72 | # session 73 | Session.vim 74 | # temporary 75 | .netrwhist 76 | # auto-generated tag files 77 | tags 78 | ### VisualStudioCode ### 79 | .vscode/* 80 | .history 81 | # End of https://www.gitignore.io/api/go,vim,emacs,visualstudiocode 82 | #IDE 
(GoLand) specific 83 | .idea/ 84 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/golang:alpine as builder 2 | 3 | RUN apk add --no-cache --virtual build-dependencies build-base linux-headers git 4 | COPY ./ /usr/src/sriov-network-metrics-exporter 5 | WORKDIR /usr/src/sriov-network-metrics-exporter 6 | RUN make clean && make build 7 | 8 | FROM docker.io/alpine:3.22 9 | COPY --from=builder /usr/src/sriov-network-metrics-exporter/build/* /usr/bin/ 10 | RUN apk update && apk add --no-cache ca-certificates && update-ca-certificates && apk add --no-cache openssl 11 | EXPOSE 9808 12 | ENTRYPOINT ["sriov-exporter"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Intel Corporation 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_REGISTRY?=ghcr.io/k8snetworkplumbingwg/ 2 | IMAGE_VERSION?=latest 3 | 4 | IMAGE_NAME?=$(IMAGE_REGISTRY)sriov-network-metrics-exporter:$(IMAGE_VERSION) 5 | IMAGE_BUILDER?=docker 6 | 7 | # Package related 8 | BINARY_NAME=sriov-exporter 9 | BUILDDIR=$(CURDIR)/build 10 | 11 | DOCKERARGS?= 12 | ifdef HTTP_PROXY 13 | DOCKERARGS += --build-arg http_proxy=$(HTTP_PROXY) 14 | endif 15 | ifdef HTTPS_PROXY 16 | DOCKERARGS += --build-arg https_proxy=$(HTTPS_PROXY) 17 | endif 18 | 19 | # Go settings 20 | GO = go 21 | GO_BUILD_OPTS ?=CGO_ENABLED=0 22 | GO_LDFLAGS ?= -s -w 23 | GO_FLAGS ?= 24 | GO_TAGS ?=-tags no_openssl 25 | export GOPATH?=$(shell go env GOPATH) 26 | 27 | # debug 28 | V ?= 0 29 | Q = $(if $(filter 1,$V),,@) 30 | 31 | all: build image-build test 32 | 33 | clean: 34 | rm -rf bin 35 | go clean -modcache -testcache 36 | 37 | build: 
38 | $Q cd $(CURDIR)/cmd && $(GO_BUILD_OPTS) go build -ldflags '$(GO_LDFLAGS)' $(GO_FLAGS) -o $(BUILDDIR)/$(BINARY_NAME) $(GO_TAGS) -v 39 | 40 | image-build: 41 | @echo "Bulding container image $(IMAGE_NAME)" 42 | $(IMAGE_BUILDER) build -f Dockerfile -t $(IMAGE_NAME) $(DOCKERARGS) . 43 | 44 | image-push: 45 | $(IMAGE_BUILDER) push $(IMAGE_NAME) 46 | 47 | test: 48 | go test ./... -count=1 49 | 50 | test-coverage: 51 | go test ./... -coverprofile cover.out 52 | go tool cover -func cover.out 53 | 54 | go-lint-install: 55 | go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.63.4 56 | 57 | go-lint: go-lint-install 58 | go mod tidy 59 | go fmt ./... 60 | golangci-lint run --color always -v ./... 61 | 62 | go-lint-report: go-lint-install 63 | golangci-lint run --color always -v ./... &> golangci-lint.txt 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SR-IOV Network Metrics Exporter 2 | Exporter that reads metrics for [SR-IOV Virtual Functions](https://www.intel.com/content/dam/doc/white-paper/pci-sig-single-root-io-virtualization-support-in-virtualization-technology-for-connectivity-paper.pdf) and exposes them in the Prometheus format. 3 | 4 | The SR-IOV Network Metrics Exporter is designed with the Kubernetes SR-IOV stack in mind, including the [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni) and the [SR-IOV Network Device Plugin](https://github.com/k8snetworkplumbingwg/sriov-network-device-plugin). 5 | 6 | **This software is a pre-production alpha version and should not be deployed to production servers.** 7 | 8 | ## Hardware support 9 | The sysfs collector for Virtual Function telemetry supports NICs with drivers that implement the SR-IOV sysfs management interface e.g. ice, i40e, mlnx_en and mlnx_ofed. 10 | 11 | The netlink collector relies on driver support and a kernel version of 4.4 or higher. 
12 | Also, the following minimum driver versions are required for this collector: 13 | - `i40e` - 2.11+ for Intel® 700 series NICs 14 | - `ice` - 1.2+ for Intel® 800 series NICs 15 | - `mlx5_core` - 5.15+ for Mellanox NICs 16 | 17 | To check your current driver version run: `modinfo <driver> | grep ^version` where `<driver>` is `i40e` or `ice`\ 18 | i40e drivers: [Intel Download Center](https://downloadcenter.intel.com/download/18026/), [Source Forge](https://sourceforge.net/projects/e1000/files/i40e%20stable/)\ 19 | ice drivers: [Intel Download Center](https://www.intel.com/content/www/us/en/download/19630/), [Source Forge](https://sourceforge.net/projects/e1000/files/ice%20stable/) 20 | 21 | ## Metrics 22 | This exporter will make the following metrics available: 23 | 24 | - **sriov_vf_rx_bytes:** Received bytes per virtual function 25 | - **sriov_vf_tx_bytes:** Transmitted bytes per virtual function 26 | - **sriov_vf_rx_packets:** Received packets per virtual function 27 | - **sriov_vf_tx_packets:** Transmitted packets per virtual function 28 | - **sriov_vf_rx_dropped:** Dropped packets on receipt per virtual function 29 | - **sriov_vf_tx_dropped:** Dropped packets on transmit per virtual function 30 | - **sriov_vf_tx_errors:** Transmit errors per virtual function 31 | - **kubepoddevice:** Virtual functions linked to active pods 32 | - **kubepodcpu:** CPUs linked to pods (Guaranteed Pods managed by CPU Manager Static policy only) 33 | 34 | ## Usage 35 | Once the SR-IOV Network Metrics Exporter is up and running, metrics can be queried in the usual way from Prometheus. 36 | The following PromQL query returns virtual function metrics with the name and namespace of the Pod it is attached to: 37 | ``` 38 | (sriov_vf_tx_errors * on (pciAddr) group_left(pod,namespace) sriov_kubepoddevice) 39 | ``` 40 | To get more detailed information about the pod the above can be joined with information from [Kube State Metrics](https://github.com/kubernetes/kube-state-metrics).
41 | 42 | For example, to get the VF along with the application name from the standard Kubernetes pod label: 43 | ``` 44 | (sriov_vf_tx_errors * on (pciAddr) group_left(pod,namespace) sriov_kubepoddevice) * on (pod,namespace) group_left (label_app_kubernetes_io_name) kube_pod_labels 45 | ``` 46 | 47 | Once available through Prometheus, VF metrics can be used by metrics applications like Grafana, or the Horizontal Pod Autoscaler. 48 | 49 | ## Installation 50 | 51 | ### Kubernetes installation 52 | 53 | Typical deployment is as a daemonset in a cluster. A daemonset requires the image to be available on each node in the cluster or at a registry accessible from each node. 54 | 55 | #### Labeling nodes 56 | 57 | SR-IOV Network Metrics Exporter will only be deployed on nodes labeled with the `"feature.node.kubernetes.io/network-sriov.capable": "true"` label. You can label the nodes automatically using [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery), or manually by executing the following `kubectl` command: 58 | 59 | ``` 60 | kubectl label node <node-name> feature.node.kubernetes.io/network-sriov.capable="true" 61 | ``` 62 | 63 | If you prefer to use the `Node Feature Discovery` you can refer to the [Quick-start guide](https://github.com/kubernetes-sigs/node-feature-discovery#quick-start--the-short-short-version) on the project's repository. 64 | 65 | #### Deploying SR-IOV Network Metrics Exporter 66 | 67 | Create monitoring namespace: 68 | ``` 69 | kubectl create namespace monitoring 70 | ``` 71 | Once the image is available from each node in the cluster run: 72 | 73 | ``` 74 | kubectl apply -f deployment/daemonset.yaml 75 | ``` 76 | This will create the daemonset and set it running.
To ensure it's running as expected run: 77 | ``` 78 | kubectl -n monitoring exec -it $(kubectl get pods -nmonitoring -o=jsonpath={.items[0].metadata.name} -lapp.kubernetes.io/name=sriov-metrics-exporter) -- wget -O- localhost:9808/metrics 79 | ``` 80 | The output of this command - which pulls data from the endpoint of the first instance of SR-IOV Network Metrics Exporter - should look something like: 81 | ``` 82 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.0",pf="ens785f2",vf="0"} 0 83 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.1",pf="ens785f2",vf="1"} 0 84 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.2",pf="ens785f2",vf="2"} 0 85 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.3",pf="ens785f2",vf="3"} 0 86 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.4",pf="ens785f2",vf="4"} 0 87 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.5",pf="ens785f2",vf="5"} 0 88 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.6",pf="ens785f2",vf="6"} 0 89 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.7",pf="ens785f2",vf="7"} 0 90 | ``` 91 | The above may show other metrics if there are no applicable SR-IOV Virtual Functions available on the system. Any metrics at all shows the pod is up and running and exposing metrics. 92 | 93 | #### Configuring Prometheus for in-Cluster installation 94 | In order to expose these metrics to Prometheus we need to configure the database to scrape our new endpoint. 
With the service contained in the daemonset file this can be done by adding: 95 | 96 | ``` 97 | - job_name: 'sriov-metrics' 98 | kubernetes_sd_configs: 99 | - role: endpoints 100 | relabel_configs: 101 | - source_labels: [__meta_kubernetes_endpoint_node_name] 102 | target_label: instance 103 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_target] 104 | action: keep 105 | regex: true 106 | static_configs: 107 | - targets: ['sriov-metrics-exporter.monitoring.svc.cluster.local'] 108 | scheme: http 109 | ``` 110 | The above should be added to the Prometheus configuration as a new target. For more about configuring Prometheus see the [official guide.](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) Once Prometheus is started with this included in its config sriov-metrics should appear on the "Targets page". Metrics should be available by querying the Prometheus API or in the web interface. 111 | 112 | In this mode it will serve stats on an endpoint inside the cluster. Prometheus will detect the annotation on the service endpoint through the above configuration. 113 | 114 | #### Building images (optional) 115 | 116 | Rather than using the Docker image available from GHCR, you may prefer to build the Docker image. 117 | 118 | The following assumes a local Docker registry available at localhost:5000, and assumes Docker is being used to build and manage containers in the cluster. 119 | 120 | In order to build the container and load it to a local registry run: 121 | 122 | ``` 123 | docker build . -t localhost:5000/sriov-metrics-exporter && docker push localhost:5000/sriov-metrics-exporter 124 | ``` 125 | or 126 | ``` 127 | make image-build IMAGE_NAME=localhost:5000/sriov-metrics-exporter && make image-push IMAGE_NAME=localhost:5000/sriov-metrics-exporter 128 | ``` 129 | 130 | The above assumes a registry available across the cluster at localhost:5000, for example using the [Docker Registry Proxy](https://github.com/kubernetes-sigs/kubespray/blob/master/roles/kubernetes-apps/registry/README.md).
131 | 132 | Update the docker image path in `deployment/daemonset.yaml` as required, e.g. `image: localhost:5000/sriov-metrics-exporter`. 133 | 134 | 135 | ### Standalone installation to an endpoint on the host. 136 | 137 | To run standalone, the SR-IOV Network Metrics Exporter must be run on each host in the cluster. 138 | Go 1.14+ is required to build the exporter. 139 | Run: 140 | ```make build``` 141 | The binary should then be started on each relevant host in the cluster. Once it is running, hitting the endpoint with: 142 | 143 | ```curl localhost:9808/metrics``` 144 | 145 | will produce a list of metrics looking something like: 146 | ``` 147 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.0",pf="ens785f2",vf="0"} 0 148 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.1",pf="ens785f2",vf="1"} 0 149 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.2",pf="ens785f2",vf="2"} 0 150 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.3",pf="ens785f2",vf="3"} 0 151 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.4",pf="ens785f2",vf="4"} 0 152 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.5",pf="ens785f2",vf="5"} 0 153 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.6",pf="ens785f2",vf="6"} 0 154 | sriov_vf_tx_packets{numa_node="0",pciAddr="0000:02:0a.7",pf="ens785f2",vf="7"} 0 155 | ``` 156 | Note: The exact metrics will depend on the setup of each system, but the format will be similar. 157 | 158 | #### Configuring Prometheus for standalone installation 159 | With the default settings the SR-IOV Network Metrics Exporter will expose metrics at port 9808. The below configuration will tell Prometheus to scrape this port on each and every host in the cluster.
160 | 161 | ``` 162 | - job_name: 'sriov-metrics-standalone' 163 | scheme: http 164 | kubernetes_sd_configs: 165 | - role: node 166 | relabel_configs: 167 | - source_labels: [__address__] 168 | regex: ^(.*):\d+$ 169 | target_label: __address__ 170 | replacement: $1:9808 171 | - target_label: __scheme__ 172 | replacement: http 173 | ``` 174 | 175 | The above should be added to the Prometheus configuration as a new target. For more about configuring Prometheus see the [official guide.](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) Once Prometheus is started with this included in its config sriov-metrics-standalone should appear on the "Targets page". Metrics should be available by querying the Prometheus API or the web interface. 176 | 177 | ### Configuration 178 | A number of configuration flags can be passed to the SR-IOV Network Metrics Exporter in order to change enabled collectors, the paths it reads from and some properties of its web endpoint. 179 | 180 | The collector.vfstatspriority flag defines the priority of vf stats collectors, each pf will use the first supported collector in the list.\ 181 | Example: using the priority, "sysfs,netlink", with Intel® 700 and 800 series NICs installed and vfs initialized, the sysfs collector will be used for the 700 series NIC, and netlink for the 800 series NIC since it doesn't support sysfs collection, therefore it falls back to the netlink driver. 
182 | 183 | | Flag | Type | Description | Default Value | 184 | |----|:----|:----|:----| 185 | | collector.kubepodcpu | boolean | Enables the kubepodcpu collector | false | 186 | | collector.kubepoddevice | boolean | Enables the kubepoddevice collector | false | 187 | | collector.vfstatspriority | string | Sets the priority of vfstats collectors | sysfs,netlink | 188 | | collector.sysfs | boolean | Enables using sr-iov sysfs for vfstats collection | true | 189 | | collector.netlink | boolean | Enables using netlink for vfstats collection | true | 190 | | path.cpucheckpoint | string | Path for location of cpu manager checkpoint file | /var/lib/kubelet/cpu_manager_state | 191 | | path.kubecgroup | string | Path for location of kubernetes cgroups on the host system | /sys/fs/cgroup/cpuset/kubepods.slice/ | 192 | | path.kubeletsocket | string | Path to kubelet resources socket | /var/lib/kubelet/pod-resources/kubelet.sock | 193 | | path.nodecpuinfo | string | Path for location of system cpu information | /sys/devices/system/node/ | 194 | | path.sysbuspci | string | Path to sys/bus/pci on host | /sys/bus/pci/devices | 195 | | path.sysclassnet | string | Path to sys/class/net on host | /sys/class/net/ | 196 | | web.listen-address | string | Address to listen on for web interface and telemetry | :9808 | 197 | | web.rate-burst | int | Maximum per second burst rate for requests | 10 | 198 | | web.rate-limit | int | Limit for requests per second | 1 | 199 | 200 | ## Communication and contribution 201 | 202 | Report a bug by [filing a new issue](https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/issues). 203 | 204 | Contribute by [opening a pull request](https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pulls). 205 | 206 | Learn [about pull requests](https://help.github.com/articles/using-pull-requests/).
207 | -------------------------------------------------------------------------------- /cmd/sriov-network-metrics-exporter.go: -------------------------------------------------------------------------------- 1 | // The SR-IOV networks exporter makes metrics from SR-IOV Virtual Functions available in a prometheus format. 2 | // Different classes of metrics are implemented as individual collectors. 3 | 4 | package main 5 | 6 | import ( 7 | "flag" 8 | "log" 9 | "net/http" 10 | 11 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/collectors" 12 | 13 | "github.com/prometheus/client_golang/prometheus" 14 | "github.com/prometheus/client_golang/prometheus/promhttp" 15 | "golang.org/x/time/rate" 16 | ) 17 | 18 | var ( 19 | addr = flag.String("web.listen-address", ":9808", "Port to listen on for web interface and telemetry.") 20 | rateLimit = flag.Int("web.rate-limit", 1, "Limit for requests per second.") 21 | rateBurst = flag.Int("web.rate-burst", 10, "Maximum per second burst rate for requests.") 22 | metricsEndpoint = "/metrics" 23 | ) 24 | 25 | func main() { 26 | parseAndVerifyFlags() 27 | 28 | err := prometheus.Register(collectors.Enabled()) 29 | if err != nil { 30 | log.Fatalf("collector could not be registered: %v", err) 31 | return 32 | } 33 | 34 | // Use the default promhttp handler wrapped with middleware to serve at the metrics endpoint 35 | handlerWithMiddleware := limitRequests( 36 | getOnly( 37 | endpointOnly( 38 | noBody(promhttp.Handler()), metricsEndpoint)), 39 | rate.Limit(*rateLimit), *rateBurst) 40 | 41 | log.Printf("listening on %v", *addr) 42 | log.Fatalf("ListenAndServe error: %v", http.ListenAndServe(*addr, handlerWithMiddleware)) 43 | } 44 | 45 | func parseAndVerifyFlags() { 46 | flag.Parse() 47 | verifyFlags() 48 | } 49 | 50 | // endpointOnly restricts all responses to 404 where the passed endpoint isn't used. Used to minimize the possible outputs of the server. 
// noBody returns a 400 to any request that contains a body.
//
// A request counts as body-less when r.Body is http.NoBody or nil. The nil
// case matters when the handler is invoked directly with a request built by
// http.NewRequest(method, url, nil), which leaves Body unset; such a request
// carries no body and is passed on to next.
func noBody(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Body != nil && r.Body != http.NoBody {
			// The status line is the whole response; there is nothing to write after it.
			w.WriteHeader(http.StatusBadRequest)
			return
		}
		next.ServeHTTP(w, r)
	})
}
// Table-driven spec for endpointOnly. Each entry sends a GET request for the
// given path through endpointOnly(promhttp.Handler(), metricsEndpoint) and
// compares the recorded status code with the expected one.
var _ = DescribeTable("test endpointOnly handler", // endpointOnly
	func(endpoint string, expectedResponse int) {
		recorder := httptest.NewRecorder()
		request := httptest.NewRequest(http.MethodGet, endpoint, nil)
		handler := endpointOnly(promhttp.Handler(), metricsEndpoint)

		handler.ServeHTTP(recorder, request)

		Expect(recorder.Code).To(Equal(expectedResponse))
	},
	// The path equal to metricsEndpoint is forwarded to the wrapped handler.
	Entry("returns status 'OK' when request endpoint is '/metrics'", "/metrics", http.StatusOK),
	// Any other path is answered with 404 by endpointOnly itself.
	Entry("returns status 'Not Found' when request endpoint is not '/metrics'", "/invalidendpoint", http.StatusNotFound),
)
// Table-driven spec for limitRequests. The limiter is built with the same
// value for the rate and the burst, then `requests` GET requests are sent
// back to back; only the status code of the LAST request is asserted.
var _ = DescribeTable("test limitRequests handler", // limitRequests
	func(limit int, requests int, expectedResponse int) {
		handler := limitRequests(promhttp.Handler(), rate.Limit(limit), limit)

		code := http.StatusOK
		for i := 0; i < requests; i++ {
			recorder := httptest.NewRecorder()
			request := httptest.NewRequest(http.MethodGet, metricsEndpoint, nil)
			handler.ServeHTTP(recorder, request)

			// Overwritten on every iteration, so it ends up holding the final response code.
			code = recorder.Code
		}

		Expect(code).To(Equal(expectedResponse))
	},
	// Request count equal to the burst size: the last request is still expected to pass.
	Entry("returns status 'OK' when the number of requests does not exceed the request limit", 10, 10, http.StatusOK),
	// One request more than the burst size: the last request is expected to be rejected.
	// NOTE(review): this presumably relies on the loop finishing before the limiter refills a token - confirm.
	Entry("returns status 'Too Many Requests' when number of requests exceeds the request limit", 10, 11, http.StatusTooManyRequests),
)
3 | 4 | package collectors 5 | 6 | import ( 7 | "flag" 8 | "fmt" 9 | "log" 10 | 11 | "github.com/prometheus/client_golang/prometheus" 12 | ) 13 | 14 | var ( 15 | collectorNamespace = "sriov" 16 | enabled = true 17 | disabled = false 18 | collectorState = make(map[string]*bool) 19 | collectorFunctions = make(map[string]func() prometheus.Collector) 20 | ) 21 | 22 | // SriovCollector registers the collectors used for specific data and exposes a Collect method to gather the data 23 | type SriovCollector []prometheus.Collector 24 | 25 | // Register defines a flag for a collector and adds it to the registry of enabled collectors if the flag is set to true - either through the default option or the flag passed on start 26 | // Run by each individual collector in its init function. 27 | func register(name string, enabled bool, collector func() prometheus.Collector) { 28 | collectorState[name] = &enabled 29 | collectorFunctions[name] = collector 30 | flag.BoolVar(collectorState[name], "collector."+name, enabled, fmt.Sprintf("Enables the %v collector", name)) 31 | } 32 | 33 | // Collect metrics from all enabled collectors in unordered sequence. 
34 | func (s SriovCollector) Collect(ch chan<- prometheus.Metric) { 35 | for _, collector := range s { 36 | collector.Collect(ch) 37 | } 38 | } 39 | 40 | // Describe each collector in unordered sequence 41 | func (s SriovCollector) Describe(ch chan<- *prometheus.Desc) { 42 | for _, collector := range s { 43 | collector.Describe(ch) 44 | } 45 | } 46 | 47 | // Enabled adds collectors enabled by default or command line flag to an SriovCollector object 48 | func Enabled() SriovCollector { 49 | collectors := make([]prometheus.Collector, 0) 50 | for collector, enabled := range collectorState { 51 | if enabled != nil && *enabled { 52 | log.Printf("The %v collector is enabled", collector) 53 | collectors = append(collectors, collectorFunctions[collector]()) 54 | } 55 | } 56 | return collectors 57 | } 58 | 59 | func ResolveFilepaths() error { 60 | resolveFuncs := []func() error{ 61 | resolveSriovDevFilepaths, 62 | resolveKubePodCPUFilepaths, 63 | resolveKubePodDeviceFilepaths, 64 | } 65 | 66 | for _, resolveFunc := range resolveFuncs { 67 | if err := resolveFunc(); err != nil { 68 | return err 69 | } 70 | } 71 | 72 | return nil 73 | } 74 | 75 | var logFatal = func(msg string, args ...any) { 76 | log.Fatalf(msg, args...) 77 | } 78 | -------------------------------------------------------------------------------- /collectors/collectors_test.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "log" 7 | "path/filepath" 8 | "testing" 9 | "time" 10 | 11 | . "github.com/onsi/ginkgo/v2" 12 | . 
"github.com/onsi/gomega" 13 | "github.com/onsi/gomega/gbytes" 14 | "github.com/prometheus/client_golang/prometheus" 15 | 16 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/utils" 17 | ) 18 | 19 | var buffer gbytes.Buffer 20 | 21 | func TestCollectors(t *testing.T) { 22 | RegisterFailHandler(Fail) 23 | RunSpecs(t, "collectors test suite") 24 | } 25 | 26 | var _ = BeforeSuite(func() { 27 | utils.EvalSymlinks = evalSymlinks 28 | 29 | logFatal = func(msg string, args ...any) { 30 | log.Printf(msg, args...) 31 | } 32 | 33 | log.SetFlags(0) 34 | }) 35 | 36 | var _ = BeforeEach(func() { 37 | buffer = *gbytes.NewBuffer() 38 | log.SetOutput(&buffer) 39 | }) 40 | 41 | type metric struct { 42 | labels map[string]string 43 | counter float64 44 | } 45 | 46 | type testCollector struct { 47 | name string 48 | } 49 | 50 | func createTestCollector() prometheus.Collector { 51 | return testCollector{ 52 | name: "collector.test", 53 | } 54 | } 55 | 56 | func (c testCollector) Collect(ch chan<- prometheus.Metric) {} 57 | func (c testCollector) Describe(chan<- *prometheus.Desc) {} 58 | 59 | var _ = DescribeTable("test registering collector", // register 60 | func(name string, enabled bool, collector func() prometheus.Collector) { 61 | register(name, enabled, collector) 62 | 63 | Expect(collectorState).To(HaveKey(name)) 64 | Expect(collectorState[name]).To(Equal(&enabled)) 65 | 66 | Expect(collectorFunctions).To(HaveKey(name)) 67 | // Expect(allCollectors[name]).To(Equal(collector)) // TODO: verify expected collector is returned 68 | 69 | }, 70 | Entry("the correct collector is enabled when default is true", 71 | "test_true", 72 | true, 73 | createTestCollector), 74 | Entry("the correct collector is not enabled when default is false", 75 | "test_false", 76 | false, 77 | createTestCollector), 78 | ) 79 | 80 | // TODO: create Enabled unit test 81 | 82 | func assertLogs(logs []string) { 83 | for _, log := range logs { 84 | 
Eventually(&buffer).WithTimeout(time.Duration(2 * time.Second)).Should(gbytes.Say(log)) 85 | } 86 | } 87 | 88 | // Replaces filepath.EvalSymlinks with an emulated evaluation to work with the in-memory fs. 89 | var evalSymlinks = func(path string) (string, error) { 90 | path = filepath.Join(filepath.Base(filepath.Dir(path)), filepath.Base(path)) 91 | 92 | if stat, err := fs.Stat(devfs, path); err == nil && stat.Mode() == fs.ModeSymlink { 93 | if target, err := fs.ReadFile(devfs, path); err == nil { 94 | return string(target), nil 95 | } else { 96 | return "", fmt.Errorf("error") 97 | } 98 | } else { 99 | return "", fmt.Errorf("error") 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /collectors/pod_cpu_link.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | // kubepodCPUCollector is a Kubernetes focused collector that exposes information about CPUs linked to specific Kubernetes pods through the CPU Manager component in Kubelet 4 | 5 | import ( 6 | "encoding/json" 7 | "flag" 8 | "fmt" 9 | "io/fs" 10 | "log" 11 | "os" 12 | "path/filepath" 13 | "regexp" 14 | "strconv" 15 | "strings" 16 | 17 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/utils" 18 | 19 | "github.com/prometheus/client_golang/prometheus" 20 | ) 21 | 22 | var ( 23 | kubepodcpu = "kubepodcpu" 24 | kubePodCgroupPath = flag.String("path.kubecgroup", "/sys/fs/cgroup/cpuset/kubepods.slice/", "Path for location of kubernetes cgroups on the host system") 25 | sysDevSysNodePath = flag.String("path.nodecpuinfo", "/sys/devices/system/node/", "Path for location of system cpu information") 26 | cpuCheckPointFile = flag.String("path.cpucheckpoint", "/var/lib/kubelet/cpu_manager_state", "Path for location of cpu manager checkpoint file") 27 | 28 | kubecgroupfs fs.FS 29 | cpuinfofs fs.FS 30 | cpucheckpointfs fs.FS 31 | ) 32 | 33 | // kubepodCPUCollector holds a static 
representation of node cpu topology and uses it to update information about kubernetes pod cpu usage. 34 | type kubepodCPUCollector struct { 35 | cpuInfo map[string]string 36 | name string 37 | } 38 | 39 | // podCPULink contains the information about the pod and container a single cpu is attached to 40 | type podCPULink struct { 41 | podID string 42 | containerID string 43 | cpu string 44 | } 45 | 46 | // cpuManagerCheckpoint is the structure needed to extract the default cpuSet information from the kubelet checkpoint file 47 | type cpuManagerCheckpoint struct { 48 | DefaultCPUSet string "json:\"defaultCpuSet\"" 49 | } 50 | 51 | // init runs the registration for this collector on package import 52 | func init() { 53 | register(kubepodcpu, disabled, createKubepodCPUCollector) 54 | } 55 | 56 | // Collect publishes the cpu information and all kubernetes pod cpu information to the prometheus channel 57 | // On each run it reads the guaranteed pod cpus and exposes the pod, container, and NUMA IDs to the collector 58 | func (c kubepodCPUCollector) Collect(ch chan<- prometheus.Metric) { 59 | // This exposes the basic cpu alignment to prometheus. 
60 | for cpu, numa := range c.cpuInfo { 61 | cpuID := "cpu" + cpu 62 | desc := prometheus.NewDesc( 63 | prometheus.BuildFQName(collectorNamespace, "", "cpu_info"), 64 | c.name, 65 | []string{"cpu", "numa_node"}, nil, 66 | ) 67 | 68 | ch <- prometheus.MustNewConstMetric( 69 | desc, 70 | prometheus.CounterValue, 71 | 1, 72 | cpuID, 73 | numa, 74 | ) 75 | } 76 | 77 | links, err := getGuaranteedPodCPUs() 78 | if err != nil { 79 | log.Printf("pod cpu links not available: %v", err) 80 | return 81 | } 82 | 83 | for _, link := range links { 84 | desc := prometheus.NewDesc( 85 | prometheus.BuildFQName(collectorNamespace, "", c.name), 86 | "pod_cpu", 87 | []string{"cpu_id", "numa_node", "uid", "container_id"}, nil, 88 | ) 89 | 90 | ch <- prometheus.MustNewConstMetric( 91 | desc, 92 | prometheus.CounterValue, 93 | 1, 94 | link.cpu, 95 | c.cpuInfo[link.cpu], 96 | link.podID, 97 | link.containerID, 98 | ) 99 | } 100 | } 101 | 102 | // Describe is not defined for this collector 103 | func (c kubepodCPUCollector) Describe(ch chan<- *prometheus.Desc) { 104 | } 105 | 106 | // createKubepodCPUCollector creates a static picture of the cpu topology of the system and returns a collector 107 | // It also creates a static list of cpus in the kubernetes parent cgroup. 108 | func createKubepodCPUCollector() prometheus.Collector { 109 | cpuInfo, err := getCPUInfo() 110 | if err != nil { 111 | //Exporter will fail here if file can not be read. 112 | logFatal("Fatal Error: cpu info for node can not be collected, %v", err.Error()) 113 | } 114 | 115 | return kubepodCPUCollector{ 116 | cpuInfo: cpuInfo, 117 | name: kubepodcpu, 118 | } 119 | } 120 | 121 | // getCPUInfo looks in the sys directory for information on CPU IDs and NUMA topology. This method runs once on initialization of the pod. 
122 | func getCPUInfo() (map[string]string, error) { 123 | cpuInfo := make(map[string]string, 0) 124 | files, err := fs.ReadDir(cpuinfofs, ".") 125 | if err != nil { 126 | return cpuInfo, fmt.Errorf("failed to read directory '%s'\n%v", *sysDevSysNodePath, err) 127 | } 128 | 129 | fileRE := regexp.MustCompile(`node\d+`) 130 | cpuFileRE := regexp.MustCompile(`cpu\d+`) 131 | for _, f := range files { 132 | if f.IsDir() { 133 | if fileRE.MatchString(f.Name()) { 134 | numaNode := f.Name()[4:] 135 | cpuFiles, err := fs.ReadDir(cpuinfofs, f.Name()) 136 | if err != nil { 137 | return cpuInfo, fmt.Errorf("failed to read directory '%s'\n%v", filepath.Join(*sysDevSysNodePath, numaNode), err) 138 | } 139 | 140 | for _, cpu := range cpuFiles { 141 | if cpuFileRE.MatchString(cpu.Name()) { 142 | cpuID := cpu.Name()[3:] 143 | cpuInfo[cpuID] = numaNode 144 | } 145 | } 146 | } 147 | } 148 | } 149 | return cpuInfo, nil 150 | } 151 | 152 | // getGuaranteedPodCPUs creates a podCPULink for each CPU that is guaranteed 153 | // This information is exposed under the cpuset in the cgroup file system with Kubernetes1.18/Docker/ 154 | // This accounting will create an entry for each guaranteed pod, even if that pod isn't managed by CPU manager 155 | // i.e. it will still create an entry if the pod is looking for millis of CPU 156 | // Todo: validate regex matching and evaluate performance of this approach 157 | // Todo: validate assumptions about directory structure against other runtimes and kubelet config. 
Plausibly problematic with CgroupsPerQos and other possible future cgroup changes 158 | func getGuaranteedPodCPUs() ([]podCPULink, error) { 159 | links := make([]podCPULink, 0) 160 | 161 | kubeCPUString, kubeDefaultSet := getKubeDefaults() 162 | 163 | podDirectoryFilenames, err := getPodDirectories() 164 | if err != nil { 165 | return links, err 166 | } 167 | 168 | for _, directory := range podDirectoryFilenames { 169 | containerIDs, err := getContainerIDs(directory) 170 | if err != nil { 171 | return links, err 172 | } 173 | 174 | for _, container := range containerIDs { 175 | cpuSet, err := readCPUSet(filepath.Join(directory, container, "cpuset.cpus")) 176 | if err != nil { 177 | return links, err 178 | } 179 | if cpuSet == kubeCPUString || cpuSet == kubeDefaultSet { 180 | continue 181 | } 182 | 183 | cpuRange, err := parseCPURange(cpuSet) 184 | if err != nil { 185 | return links, err 186 | } 187 | 188 | for _, link := range cpuRange { 189 | links = append(links, podCPULink{directory[12 : len(directory)-6], container, link}) 190 | } 191 | } 192 | } 193 | return links, nil 194 | } 195 | 196 | func getPodDirectories() ([]string, error) { 197 | podDirectoryFilenames := make([]string, 0) 198 | 199 | files, err := fs.ReadDir(kubecgroupfs, ".") // all files in the directory 200 | if err != nil { 201 | return podDirectoryFilenames, fmt.Errorf("could not open path kubePod cgroups: %v", err) 202 | } 203 | 204 | podDirectoryRegex := regexp.MustCompile("pod[[:xdigit:]]{8}[_-][[:xdigit:]]{4}[_-][[:xdigit:]]{4}[_-][[:xdigit:]]{4}[_-][[:xdigit:]]{12}") 205 | for _, podDirectory := range files { 206 | podDirectoryFilename := podDirectory.Name() 207 | if match := podDirectoryRegex.MatchString(podDirectoryFilename); match { 208 | podDirectoryFilenames = append(podDirectoryFilenames, podDirectoryFilename) 209 | } 210 | } 211 | return podDirectoryFilenames, nil 212 | } 213 | 214 | func getContainerIDs(podDirectoryFilename string) ([]string, error) { 215 | 
// parseCPURange expands a cpu list in the Kernel cpuset format (comma separated
// entries, each either a single id or an inclusive "start-end" range) into the
// individual cpu ids.
//
// Empty entries - including an entirely empty or blank input - describe no cpu
// and are skipped, so an empty cpuset yields an empty list rather than a list
// holding one empty id. A single id is passed through as written; the bounds of
// a range must be integers, otherwise the strconv error is returned together
// with the ids collected so far.
func parseCPURange(cpuString string) ([]string, error) {
	cpuList := make([]string, 0)

	for _, r := range strings.Split(cpuString, ",") {
		if strings.TrimSpace(r) == "" {
			continue
		}

		endpoints := strings.Split(r, "-")
		switch len(endpoints) {
		case 1:
			cpuList = append(cpuList, endpoints[0])
		case 2:
			start, err := strconv.Atoi(endpoints[0])
			if err != nil {
				return cpuList, err
			}

			end, err := strconv.Atoi(endpoints[1])
			if err != nil {
				return cpuList, err
			}

			for e := start; e <= end; e++ {
				cpuList = append(cpuList, strconv.Itoa(e))
			}
		}
	}

	return cpuList, nil
}
"github.com/onsi/gomega" 12 | "github.com/prometheus/client_golang/prometheus" 13 | dto "github.com/prometheus/client_model/go" 14 | ) 15 | 16 | var _ = DescribeTable("test pod cpu link collection", // Collect 17 | func(fsys fs.FS, expected []metric, logs ...string) { 18 | cpuinfofs = fsys 19 | kubecgroupfs = fsys 20 | cpucheckpointfs = fsys 21 | 22 | ch := make(chan prometheus.Metric, 1) 23 | go createKubepodCPUCollector().Collect(ch) 24 | 25 | for i := 0; i < len(expected); i++ { 26 | m := dto.Metric{} 27 | err := (<-ch).Write(&m) 28 | Expect(err).ToNot(HaveOccurred()) 29 | 30 | labels := make(map[string]string, 4) 31 | for _, label := range m.Label { 32 | labels[*label.Name] = *label.Value 33 | } 34 | 35 | metric := metric{labels: labels, counter: *m.Counter.Value} 36 | 37 | Expect(metric).To(BeElementOf(expected)) 38 | } 39 | 40 | assertLogs(logs) 41 | }, 42 | Entry("test numa node and cpuset collection", 43 | fstest.MapFS{ 44 | "node0/cpu0": {Mode: fs.ModeDir}, 45 | "node0/cpu2": {Mode: fs.ModeDir}, 46 | "node1/cpu1": {Mode: fs.ModeDir}, 47 | "node1/cpu3": {Mode: fs.ModeDir}, 48 | "cpuset.cpus": {Data: []byte("4-7")}, 49 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 50 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice/0123456789abcdefaaaa/cpuset.cpus": {Data: []byte("0-3")}}, 51 | []metric{ 52 | {map[string]string{"cpu": "cpu0", "numa_node": "0"}, 1}, 53 | {map[string]string{"cpu": "cpu2", "numa_node": "0"}, 1}, 54 | {map[string]string{"cpu": "cpu1", "numa_node": "1"}, 1}, 55 | {map[string]string{"cpu": "cpu3", "numa_node": "1"}, 1}, 56 | {map[string]string{"cpu_id": "0", "numa_node": "0", "uid": "6b5b533a_6307_48d1_911f_07bf5d4e1c82", "container_id": "0123456789abcdefaaaa"}, 1}, 57 | {map[string]string{"cpu_id": "2", "numa_node": "0", "uid": "6b5b533a_6307_48d1_911f_07bf5d4e1c82", "container_id": "0123456789abcdefaaaa"}, 1}, 58 | {map[string]string{"cpu_id": "1", "numa_node": "1", 
"uid": "6b5b533a_6307_48d1_911f_07bf5d4e1c82", "container_id": "0123456789abcdefaaaa"}, 1}, 59 | {map[string]string{"cpu_id": "3", "numa_node": "1", "uid": "6b5b533a_6307_48d1_911f_07bf5d4e1c82", "container_id": "0123456789abcdefaaaa"}, 1}}), 60 | Entry("test unavailable kube cgroup directory", 61 | fstest.MapFS{ 62 | "node0/cpu0": {Mode: fs.ModeDir}, 63 | "node0/cpu2": {Mode: fs.ModeDir}, 64 | "node1/cpu1": {Mode: fs.ModeDir}, 65 | "node1/cpu3": {Mode: fs.ModeDir}, 66 | "cpuset.cpus": {Data: []byte("4-7")}, 67 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 68 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c83.slice": {Mode: fs.ModeExclusive}}, 69 | []metric{ 70 | {map[string]string{"cpu": "cpu0", "numa_node": "0"}, 1}, 71 | {map[string]string{"cpu": "cpu2", "numa_node": "0"}, 1}, 72 | {map[string]string{"cpu": "cpu1", "numa_node": "1"}, 1}, 73 | {map[string]string{"cpu": "cpu3", "numa_node": "1"}, 1}}, 74 | "pod cpu links not available: could not read cpu files directory: readdir kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c83.slice: not implemented"), 75 | ) 76 | 77 | var _ = DescribeTable("test reading default cpu set", // readDefaultSet 78 | func(data []byte, expected string, logs ...string) { 79 | Expect(readDefaultSet(data)).To(Equal(expected)) 80 | 81 | assertLogs(logs) 82 | }, 83 | Entry("read empty", 84 | []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"\",\"checksum\":1353318690}"), 85 | ""), 86 | Entry("read successful", 87 | []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"1,2,3,4\",\"checksum\":1353318690}"), 88 | "1,2,3,4"), 89 | Entry("read failed with malformed data", 90 | []byte("\"policyName\":\"none\",\"checksum\":1353318690"), 91 | "", 92 | "cpu checkpoint file could not be unmarshalled, error: invalid character ':' after top-level value"), 93 | ) 94 | 95 | var _ = DescribeTable("test creating kubepodCPU collector", // createKubepodCPUCollector 96 | func(fsys fs.FS, 
expectedCollector kubepodCPUCollector, logs ...string) { 97 | cpuinfofs = fsys 98 | 99 | collector := createKubepodCPUCollector() 100 | Expect(collector).To(Equal(expectedCollector)) 101 | 102 | assertLogs(logs) 103 | }, 104 | Entry("successful creation", 105 | fstest.MapFS{ 106 | "node0/cpu0": {Mode: fs.ModeDir}, 107 | "node0/cpu2": {Mode: fs.ModeDir}, 108 | "node1/cpu1": {Mode: fs.ModeDir}, 109 | "node1/cpu3": {Mode: fs.ModeDir}}, 110 | kubepodCPUCollector{cpuInfo: map[string]string{"0": "0", "2": "0", "1": "1", "3": "1"}, name: kubepodcpu}), 111 | Entry("directory doesn't exist", 112 | fstest.MapFS{".": {Mode: fs.ModeExclusive}}, // to emulate the directory doesn't exist 113 | kubepodCPUCollector{cpuInfo: map[string]string{}, name: kubepodcpu}, 114 | "Fatal Error: cpu info for node can not be collected, failed to read directory '/sys/devices/system/node/'\nreaddir .: not implemented"), 115 | ) 116 | 117 | var _ = DescribeTable("test getting kubernetes cpu list", // getKubeDefaults 118 | func(fsys fs.FS, expectedKubeCPUString, expectedDefaultSet string) { 119 | kubecgroupfs = fsys 120 | cpucheckpointfs = fsys 121 | 122 | kubeCPUString, kubeDefaultSet := getKubeDefaults() 123 | Expect(kubeCPUString).To(Equal(expectedKubeCPUString)) 124 | Expect(kubeDefaultSet).To(Equal(expectedDefaultSet)) 125 | }, 126 | Entry("read empty", 127 | fstest.MapFS{ 128 | "cpuset.cpus": {Data: []byte("")}, 129 | "cpu_manager_state": {Data: []byte("")}}, 130 | "", 131 | ""), 132 | Entry("read successful", 133 | fstest.MapFS{ 134 | "cpuset.cpus": {Data: []byte("0-87")}, 135 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"static\",\"defaultCpuSet\":\"0-63\",\"checksum\":1058907510}")}}, 136 | "0-87", 137 | "0-63"), 138 | Entry("read successful with malformed data", 139 | fstest.MapFS{ 140 | "cpuset.cpus": {Data: []byte(" 0-87 ")}, 141 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"static\",\"defaultCpuSet\":\"0-63\",\"checksum\":1058 907 51 0}")}}, 142 | "0-87", 143 | ""), 
144 | Entry("read failed, file doesn't exist", 145 | fstest.MapFS{}, 146 | "", 147 | ""), 148 | ) 149 | 150 | var _ = DescribeTable("test getting guaranteed pod cpus", // guaranteedPodCPUs 151 | func(fsys fs.FS, expected []podCPULink, expectedErr error, logs ...string) { 152 | kubecgroupfs = fsys 153 | cpucheckpointfs = fsys 154 | 155 | data, err := getGuaranteedPodCPUs() 156 | Expect(data).To(Equal(expected)) 157 | 158 | if expectedErr != nil { 159 | Expect(err).To(MatchError(expectedErr)) 160 | } 161 | 162 | assertLogs(logs) 163 | }, 164 | Entry("container cpuset available", 165 | fstest.MapFS{ 166 | "cpuset.cpus": {Data: []byte("0-3")}, 167 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 168 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice/0123456789abcdefaaaa/cpuset.cpus": {Data: []byte("8-11")}}, 169 | []podCPULink{ 170 | {"6b5b533a_6307_48d1_911f_07bf5d4e1c82", "0123456789abcdefaaaa", "8"}, 171 | {"6b5b533a_6307_48d1_911f_07bf5d4e1c82", "0123456789abcdefaaaa", "9"}, 172 | {"6b5b533a_6307_48d1_911f_07bf5d4e1c82", "0123456789abcdefaaaa", "10"}, 173 | {"6b5b533a_6307_48d1_911f_07bf5d4e1c82", "0123456789abcdefaaaa", "11"}}, 174 | nil), 175 | Entry("cgroup directory doesn't exist", 176 | fstest.MapFS{".": {Mode: fs.ModeExclusive}}, 177 | []podCPULink{}, 178 | fmt.Errorf("could not open path kubePod cgroups: readdir .: not implemented"), 179 | "cannot get information on Kubernetes CPU usage, could not open cgroup cpuset files, error: open cpuset.cpus: file does not exist", 180 | "unable to read cpu checkpoint file '/var/lib/kubelet/cpu_manager_state', error: open cpu_manager_state: file does not exist", 181 | "cpu checkpoint file could not be unmarshalled, error: unexpected end of JSON input"), 182 | Entry("unable to read pod cgroup directory", 183 | fstest.MapFS{ 184 | "cpuset.cpus": {Data: []byte("0-3")}, 185 | "cpu_manager_state": {Data: 
[]byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 186 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice": {Mode: fs.ModeExclusive}}, 187 | []podCPULink{}, 188 | fmt.Errorf("could not read cpu files directory: readdir kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice: not implemented")), 189 | Entry("unable to read container cpuset file", 190 | fstest.MapFS{ 191 | "cpuset.cpus": {Data: []byte("0-3")}, 192 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 193 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice/0123456789abcdefaaaa/cpuset.cpus": {Mode: fs.ModeDir}}, 194 | []podCPULink{}, 195 | fmt.Errorf("could not open cgroup cpuset files, error: read kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice/0123456789abcdefaaaa/cpuset.cpus: invalid argument")), 196 | Entry("container cpuset range covered by defaults", 197 | fstest.MapFS{ 198 | "cpuset.cpus": {Data: []byte("0-3")}, 199 | "cpu_manager_state": {Data: []byte("{\"policyName\":\"none\",\"defaultCpuSet\":\"4-7\",\"checksum\":1353318690}")}, 200 | "kubepods-pod6b5b533a_6307_48d1_911f_07bf5d4e1c82.slice/0123456789abcdefaaaa/cpuset.cpus": {Data: []byte("0-3")}}, 201 | []podCPULink{}, 202 | nil), 203 | ) 204 | 205 | var _ = DescribeTable("test parsing cpu file", // parseCPUFile 206 | func(path string, fsys fs.FS, expectedString string, expectedErr error) { 207 | kubecgroupfs = fsys 208 | 209 | data, err := readCPUSet(path) 210 | Expect(data).To(Equal(expectedString)) 211 | 212 | if expectedErr != nil { 213 | Expect(err).To(Equal(expectedErr)) 214 | } 215 | }, 216 | Entry("read empty", 217 | "cpuset.cpus", 218 | fstest.MapFS{ 219 | "cpuset.cpus": {Data: []byte("")}}, 220 | "", 221 | nil), 222 | Entry("read successful", 223 | "cpuset.cpus", 224 | fstest.MapFS{ 225 | "cpuset.cpus": {Data: []byte("0-87")}}, 226 | "0-87", 227 | nil), 228 | Entry("read successful with malformed data", 229 | 
"cpuset.cpus", 230 | fstest.MapFS{ 231 | "cpuset.cpus": {Data: []byte(" 0-87 ")}}, 232 | "0-87", 233 | nil), 234 | Entry("read failed, file doesn't exist", 235 | "cpuset.cpus", 236 | fstest.MapFS{}, 237 | "", 238 | fmt.Errorf("could not open cgroup cpuset files, error: open cpuset.cpus: file does not exist")), 239 | ) 240 | 241 | var _ = DescribeTable("test parsing cpu range", // parseCPURange 242 | func(cpuString string, expected []string, expectedErr error) { 243 | data, err := parseCPURange(cpuString) 244 | Expect(data).To(Equal(expected)) 245 | 246 | if expectedErr != nil { 247 | Expect(err).To(MatchError(expectedErr)) 248 | } 249 | }, 250 | Entry("valid range '0-3,7-9'", 251 | "0-3,7-9", 252 | []string{"0", "1", "2", "3", "7", "8", "9"}, 253 | nil), 254 | Entry("valid range '0-3'", 255 | "0-3", 256 | []string{"0", "1", "2", "3"}, 257 | nil), 258 | Entry("valid range '7'", 259 | "7", 260 | []string{"7"}, 261 | nil), 262 | Entry("invalid range '-1'", 263 | "-1", 264 | []string{}, 265 | strconv.ErrSyntax), 266 | Entry("invalid range '0-'", 267 | "0-", 268 | []string{}, 269 | strconv.ErrSyntax), 270 | ) 271 | 272 | var _ = DescribeTable("test getting cpu info", // getCPUInfo 273 | func(fsys fs.FS, expectedData map[string]string, expectedErr error) { 274 | cpuinfofs = fsys 275 | 276 | data, err := getCPUInfo() 277 | 278 | for k, v := range expectedData { 279 | Expect(data).To(HaveKey(k)) 280 | Expect(data[k]).To(Equal(v)) 281 | } 282 | 283 | Expect(data).To(Equal(expectedData)) 284 | 285 | if expectedErr != nil { 286 | Expect(err).To(MatchError(expectedErr)) 287 | } 288 | }, 289 | Entry("valid info", 290 | fstest.MapFS{ 291 | "node0/cpu0": {Mode: fs.ModeDir}, 292 | "node0/cpu2": {Mode: fs.ModeDir}, 293 | "node1/cpu1": {Mode: fs.ModeDir}, 294 | "node1/cpu3": {Mode: fs.ModeDir}}, 295 | map[string]string{"0": "0", "2": "0", "1": "1", "3": "1"}, 296 | nil), 297 | Entry("directory doesn't exist", 298 | fstest.MapFS{".": {Mode: fs.ModeExclusive}}, // to emulate the 
directory doesn't exist 299 | map[string]string{}, 300 | errors.New("failed to read directory '/sys/devices/system/node/'\nreaddir .: not implemented")), 301 | ) 302 | 303 | // TODO: create integration tests for GetV1Client and PodResources, they require the kubelet API 304 | -------------------------------------------------------------------------------- /collectors/pod_dev_link.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | // pod_dev_link publishes which devices are connected to which pods in Kubernetes by querying the Kubelet api 4 | 5 | import ( 6 | "context" 7 | "flag" 8 | "fmt" 9 | "log" 10 | "net" 11 | "net/url" 12 | "regexp" 13 | "strings" 14 | "time" 15 | 16 | "github.com/prometheus/client_golang/prometheus" 17 | "google.golang.org/grpc" 18 | "google.golang.org/grpc/credentials/insecure" 19 | v1 "k8s.io/kubelet/pkg/apis/podresources/v1" 20 | 21 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/utils" 22 | ) 23 | 24 | var ( 25 | defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 Mb 26 | podDevLinkName = "kubepoddevice" 27 | podResourcesPath = flag.String("path.kubeletsocket", "/var/lib/kubelet/pod-resources/kubelet.sock", "Path to kubelet resources socket") 28 | pciAddressPattern = regexp.MustCompile(`^[[:xdigit:]]{4}:[[:xdigit:]]{2}:[[:xdigit:]]{2}\.\d$`) 29 | ) 30 | 31 | // podDevLinkCollector the basic type used to collect information on kubernetes device links 32 | type podDevLinkCollector struct { 33 | name string 34 | } 35 | 36 | // init runs the registration for this collector on package import 37 | func init() { 38 | register(podDevLinkName, disabled, createPodDevLinkCollector) 39 | } 40 | 41 | // This collector starts by making a call to the kubelet api which could create a delay. 42 | // This information could be cached on a loop after the previous call to improve prometheus scraping performance. 
43 | 44 | // Collect scrapes the kubelet api and structures the returned value into a prometheus info metric. 45 | func (c podDevLinkCollector) Collect(ch chan<- prometheus.Metric) { 46 | resources := PodResources() 47 | for _, podRes := range resources { 48 | podName := podRes.GetName() 49 | podNamespace := podRes.GetNamespace() 50 | for _, contRes := range podRes.Containers { 51 | contName := contRes.GetName() 52 | for _, devices := range contRes.GetDevices() { 53 | devType := devices.ResourceName 54 | for _, dev := range devices.DeviceIds { 55 | if !isPci(dev) { 56 | continue 57 | } 58 | 59 | desc := prometheus.NewDesc( 60 | prometheus.BuildFQName(collectorNamespace, "", c.name), 61 | c.name, 62 | []string{"pciAddr", "dev_type", "pod", "namespace", "container"}, nil, 63 | ) 64 | 65 | ch <- prometheus.MustNewConstMetric( 66 | desc, 67 | prometheus.CounterValue, 68 | 1, 69 | dev, 70 | devType, 71 | podName, 72 | podNamespace, 73 | contName, 74 | ) 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | // Describe has no defined behaviour for this collector 82 | func (c podDevLinkCollector) Describe(ch chan<- *prometheus.Desc) { 83 | } 84 | 85 | func createPodDevLinkCollector() prometheus.Collector { 86 | return podDevLinkCollector{ 87 | name: podDevLinkName, 88 | } 89 | } 90 | 91 | // PodResources uses the kubernetes kubelet api to get information about the devices and the pods they are attached to. 92 | // We create and close a new connection here on each run. 
The performance impact of this seems marginal - but sharing a connection might save cpu time 93 | func PodResources() []*v1.PodResources { 94 | var podResource []*v1.PodResources 95 | 96 | kubeletSocket := strings.Join([]string{"unix:///", *podResourcesPath}, "") 97 | client, conn, err := GetV1Client(kubeletSocket, 10*time.Second, defaultPodResourcesMaxSize) 98 | if err != nil { 99 | log.Print(err) 100 | return podResource 101 | } 102 | 103 | defer conn.Close() 104 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 105 | defer cancel() 106 | resp, err := client.List(ctx, &v1.ListPodResourcesRequest{}) 107 | if err != nil { 108 | log.Printf("getPodResources: failed to list pod resources, %v.Get(_) = _, %v", client, err) 109 | return podResource 110 | } 111 | 112 | podResource = resp.PodResources 113 | 114 | return podResource 115 | } 116 | 117 | // Checks to see if a device id matches a pci address. If not we're able to discard it. 118 | func isPci(id string) bool { 119 | return pciAddressPattern.MatchString(id) 120 | } 121 | 122 | func resolveKubePodDeviceFilepaths() error { 123 | if err := utils.ResolveFlag("path.kubeletsocket", podResourcesPath); err != nil { 124 | return err 125 | } 126 | 127 | return nil 128 | } 129 | 130 | // GetV1Client returns a client for the PodResourcesLister grpc service 131 | // Extracted from package k8s.io/kubernetes/pkg/kubelet/apis/podresources client.go v1.24.3 132 | // This is what is recommended for consumers of this package 133 | func GetV1Client(socket string, connectionTimeout time.Duration, maxMsgSize int) (v1.PodResourcesListerClient, *grpc.ClientConn, error) { 134 | url, err := url.Parse(socket) 135 | if err != nil { 136 | return nil, nil, err 137 | } 138 | 139 | ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) 140 | defer cancel() 141 | 142 | conn, err := grpc.DialContext(ctx, url.Path, 143 | grpc.WithTransportCredentials(insecure.NewCredentials()), 144 | 
grpc.WithContextDialer(dialer), 145 | grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize))) 146 | if err != nil { 147 | return nil, nil, fmt.Errorf("error dialing socket %s: %v", socket, err) 148 | } 149 | return v1.NewPodResourcesListerClient(conn), conn, nil 150 | } 151 | 152 | func dialer(ctx context.Context, addr string) (net.Conn, error) { 153 | return (&net.Dialer{}).DialContext(ctx, "unix", addr) 154 | } 155 | -------------------------------------------------------------------------------- /collectors/pod_dev_link_test.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | import ( 4 | . "github.com/onsi/ginkgo/v2" 5 | . "github.com/onsi/gomega" 6 | ) 7 | 8 | // TODO: create Collector and dialer unit tests 9 | 10 | var _ = Describe("test creating podDevLink collector", func() { // createPodDevLinkCollector 11 | It("returns the correct collector", func() { 12 | collector := createPodDevLinkCollector() 13 | Expect(collector).To(Equal(podDevLinkCollector{name: podDevLinkName})) 14 | }) 15 | }) 16 | 17 | var _ = DescribeTable("test pci address regexp: "+pciAddressPattern.String(), // isPci 18 | func(pciAddr string, expected bool) { 19 | Expect(isPci(pciAddr)).To(Equal(expected)) 20 | }, 21 | Entry("valid, 0000:00:00.0", "0000:00:00.0", true), 22 | Entry("valid, ffff:00:00.0", "ffff:00:00.0", true), 23 | Entry("valid, 0000:ff:00.0", "0000:ff:00.0", true), 24 | Entry("valid, 0000:00:ff.0", "0000:00:ff.0", true), 25 | Entry("valid, 0000:00:00.0", "0000:00:00.0", true), 26 | Entry("invalid, 0000.00:00.0", "0000.00:00.0", false), 27 | Entry("invalid, 0000:00.00.0", "0000:00.00.0", false), 28 | Entry("invalid, 0000:00:00:0", "0000:00:00:0", false), 29 | Entry("invalid, gggg:00:00.0", "gggg:00:00.0", false), 30 | Entry("invalid, 0000:gg:00.0", "0000:gg:00.0", false), 31 | Entry("invalid, 0000:00:gg.0", "0000:00:gg.0", false), 32 | Entry("invalid, 0000:00:00.a", "0000:00:00.a", false), 33 | 
Entry("invalid, 00000:00:00.0", "00000:00:00.0", false), 34 | Entry("invalid, 0000:000:00.0", "0000:000:00.0", false), 35 | Entry("invalid, 0000:00:000.0", "0000:00:000.0", false), 36 | Entry("invalid, 0000:00:00.00", "0000:00:00.00", false), 37 | ) 38 | 39 | // TODO: create integration tests for GetV1Client and PodResources, they require the kubelet API 40 | -------------------------------------------------------------------------------- /collectors/sriovdev.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | // sriovDev has the methods for implementing an sriov stats reader and publishing its information to Prometheus 4 | 5 | import ( 6 | "flag" 7 | "fmt" 8 | "io/fs" 9 | "log" 10 | "os" 11 | "path/filepath" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/prometheus/client_golang/prometheus" 16 | 17 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/utils" 18 | ) 19 | 20 | const ( 21 | noNumaInfo = "-1" 22 | ) 23 | 24 | var ( 25 | collectorPriority utils.StringListFlag 26 | defaultPriority = utils.StringListFlag{"sysfs", "netlink"} 27 | sysBusPci = flag.String("path.sysbuspci", "/sys/bus/pci/devices", "Path to sys/bus/pci/devices/ on host") 28 | sysClassNet = flag.String("path.sysclassnet", "/sys/class/net", "Path to sys/class/net/ on host") 29 | pfNameFile = "net" 30 | netClassFile = "class" 31 | netClass int64 = 0x020000 32 | vfStatsSubsystem = "vf" 33 | vfStatsCollectorName = "vfstats" 34 | 35 | devfs fs.FS 36 | netfs fs.FS 37 | ) 38 | 39 | // vfsPCIAddr is a map of VF IDs to VF PCI addresses i.e. 
{"0": "0000:3b:02.0", "1": "0000:3b:02.1"} 40 | type vfsPCIAddr map[string]string 41 | 42 | // init runs the registration for this collector on package import 43 | func init() { 44 | flag.Var(&collectorPriority, "collector.vfstatspriority", "Priority of collectors") 45 | register(vfStatsCollectorName, enabled, createSriovDevCollector) 46 | } 47 | 48 | // This is the generic collector for VF stats. 49 | type sriovDevCollector struct { 50 | name string 51 | pfsWithNumaInfo map[string]string 52 | } 53 | 54 | type sriovDev struct { 55 | name string 56 | reader sriovStatReader 57 | vfs vfsPCIAddr 58 | } 59 | 60 | // Collect runs the appropriate collector for each SR-IOV vf on the system and publishes its statistics. 61 | func (c sriovDevCollector) Collect(ch chan<- prometheus.Metric) { 62 | log.Printf("collecting sr-iov device metrics") 63 | 64 | priority := collectorPriority 65 | if len(priority) == 0 { 66 | log.Printf("collector.priority not specified in flags, using default priority") 67 | priority = defaultPriority 68 | } 69 | 70 | log.Printf("collector priority: %s", priority) 71 | for pfAddr, numaNode := range c.pfsWithNumaInfo { 72 | pf := getSriovDev(pfAddr, priority) 73 | 74 | if pf.reader == nil { 75 | continue 76 | } 77 | 78 | for id, address := range pf.vfs { 79 | stats := pf.reader.ReadStats(pf.name, id) 80 | for name, v := range stats { 81 | desc := prometheus.NewDesc( 82 | prometheus.BuildFQName(collectorNamespace, vfStatsSubsystem, name), 83 | fmt.Sprintf("Statistic %s.", name), 84 | []string{"pf", "vf", "pciAddr", "numa_node"}, nil, 85 | ) 86 | 87 | ch <- prometheus.MustNewConstMetric( 88 | desc, 89 | prometheus.CounterValue, 90 | float64(v), 91 | pf.name, 92 | id, 93 | address, 94 | numaNode, 95 | ) 96 | } 97 | } 98 | } 99 | } 100 | 101 | // Describe isn't implemented for this collector 102 | func (c sriovDevCollector) Describe(ch chan<- *prometheus.Desc) { 103 | } 104 | 105 | // sriovDevCollector is initialized with the physical functions on the host. 
This is not updated after initialization. 106 | func createSriovDevCollector() prometheus.Collector { 107 | devs := getSriovDevAddrs() 108 | numaNodes := getNumaNodes(devs) 109 | 110 | return sriovDevCollector{ 111 | name: vfStatsCollectorName, 112 | pfsWithNumaInfo: numaNodes, 113 | } 114 | } 115 | 116 | // getSriovDevAddrs returns the PCI addresses of the SRIOV capable Physical Functions on the host. 117 | func getSriovDevAddrs() []string { 118 | sriovDevs := make([]string, 0) 119 | 120 | devs, err := fs.Glob(devfs, "*/sriov_totalvfs") 121 | if err != nil { 122 | log.Printf("Invalid pattern\n%v", err) // unreachable code 123 | } 124 | 125 | if len(devs) == 0 { 126 | log.Printf("no sriov net devices found") 127 | } 128 | 129 | for _, dev := range devs { 130 | devAddr := filepath.Dir(dev) 131 | if isNetDevice(filepath.Join(devAddr, netClassFile)) { 132 | sriovDevs = append(sriovDevs, devAddr) 133 | } 134 | } 135 | 136 | return sriovDevs 137 | } 138 | 139 | // getSriovDev returns a sriovDev record containing the physical function interface name, stats reader and initialized virtual functions. 
140 | func getSriovDev(pfAddr string, priority []string) sriovDev { 141 | name := getPFName(pfAddr) 142 | vfs, err := vfList(pfAddr) 143 | if err != nil { 144 | log.Printf("error getting vf address\n%v", err) 145 | } 146 | 147 | reader, err := getStatsReader(name, priority) 148 | if err != nil { 149 | log.Printf("error getting stats reader for %s: %v", name, err) 150 | } 151 | 152 | return sriovDev{ 153 | name, 154 | reader, 155 | vfs, 156 | } 157 | } 158 | 159 | // getNumaNodes returns the numa location for each of the PFs with SR-IOV capabilities 160 | func getNumaNodes(devs []string) map[string]string { 161 | pfNumaInfo := make(map[string]string) 162 | 163 | for _, dev := range devs { 164 | numaFilepath := filepath.Join(dev, "numa_node") 165 | numaRaw, err := fs.ReadFile(devfs, numaFilepath) 166 | if err != nil { 167 | log.Printf("could not read numa_node file for device '%s'\n%v", dev, err) 168 | pfNumaInfo[dev] = "" 169 | continue 170 | } 171 | 172 | numaNode := strings.TrimSpace(string(numaRaw)) 173 | if numaNode == noNumaInfo { 174 | log.Printf("no numa node information for device '%s'", dev) 175 | pfNumaInfo[dev] = "" 176 | continue 177 | } 178 | 179 | pfNumaInfo[dev] = numaNode 180 | } 181 | 182 | return pfNumaInfo 183 | } 184 | 185 | // vfList returns the virtual functions associated with the specified SRIOV physical function 186 | func vfList(dev string) (vfsPCIAddr, error) { 187 | vfList := make(vfsPCIAddr, 0) 188 | 189 | vfs, err := fs.Glob(devfs, filepath.Join(dev, "virtfn*")) 190 | if err != nil { 191 | log.Printf("Invalid pattern\n%v", err) // unreachable code 192 | } 193 | 194 | // Read all VF directories and add VF PCI addr to the vfList 195 | for _, vf := range vfs { 196 | if id, link := vfData(vf); id != "" && link != "" { 197 | vfList[id] = link 198 | } 199 | } 200 | 201 | if len(vfList) == 0 { 202 | return vfList, fmt.Errorf("no virtual functions found for pf '%s'", dev) 203 | } 204 | 205 | return vfList, nil 206 | } 207 | 208 | // vfData gets 
vf id and pci address from the path specified 209 | func vfData(vfDir string) (string, string) { 210 | if link, err := utils.EvalSymlinks(filepath.Join(*sysBusPci, vfDir)); err == nil { 211 | return filepath.Base(vfDir)[6:], filepath.Base(link) 212 | } else { 213 | log.Printf("error evaluating symlink '%s'\n%v", vfDir, err) 214 | return "", "" 215 | } 216 | } 217 | 218 | // getPFName resolves the system's name for a physical interface from the PCI address linked to it. 219 | func getPFName(device string) string { 220 | pfDevPath := filepath.Join(device, pfNameFile) 221 | pfdir, err := fs.ReadDir(devfs, pfDevPath) 222 | if err != nil || len(pfdir) == 0 { 223 | log.Printf("%s - could not get pf interface name in path '%s'\n%v", device, pfDevPath, err) 224 | return "" 225 | } 226 | 227 | return pfdir[0].Name() 228 | } 229 | 230 | // isNetDevice checks if the device is a net device by checking its device class 231 | func isNetDevice(filepath string) bool { 232 | file, err := fs.ReadFile(devfs, filepath) 233 | if err != nil { 234 | return false 235 | } 236 | 237 | classHex := strings.TrimSpace(string(file)) 238 | deviceClass, err := strconv.ParseInt(classHex, 0, 64) 239 | if err != nil { 240 | log.Printf("could not parse class file: %v", err) 241 | return false 242 | } 243 | 244 | return deviceClass == netClass 245 | } 246 | 247 | func resolveSriovDevFilepaths() error { 248 | if err := utils.ResolveFlag("path.sysbuspci", sysBusPci); err != nil { 249 | return err 250 | } 251 | 252 | if err := utils.ResolveFlag("path.sysclassnet", sysClassNet); err != nil { 253 | return err 254 | } 255 | 256 | devfs = os.DirFS(*sysBusPci) 257 | netfs = os.DirFS(*sysClassNet) 258 | 259 | return nil 260 | } 261 | -------------------------------------------------------------------------------- /collectors/sriovdev_readers.go: -------------------------------------------------------------------------------- 1 | // This file should contain different sriov stat implementations for different 
drivers and versions. 2 | 3 | package collectors 4 | 5 | import ( 6 | "fmt" 7 | "io/fs" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "strconv" 12 | "strings" 13 | 14 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/utils" 15 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/vfstats" 16 | ) 17 | 18 | const sriovVFStatsDir = "%s/device/sriov/%s/stats" 19 | 20 | type sriovStats map[string]int64 21 | 22 | // sriovStatReader is an interface which takes in the Physical Function name and vf id and returns the stats for the VF 23 | type sriovStatReader interface { 24 | ReadStats(vfID string, pfName string) sriovStats 25 | } 26 | 27 | // netlinkReader is able to read stats from drivers that support the netlink interface 28 | type netlinkReader struct { 29 | data vfstats.PerPF 30 | } 31 | 32 | // sysfsReader is able to read stats from Physical Functions running the i40e or ice driver 33 | // Other drivers that store all VF stats in files under one folder could use this reader 34 | type sysfsReader struct { 35 | statsFS string 36 | } 37 | 38 | // check if a reader can read stats for a given pf and vfID 39 | func readerHasStats(reader sriovStatReader, pfName, vfID string) bool { 40 | stats := reader.ReadStats(pfName, vfID) 41 | return len(stats) > 0 42 | } 43 | 44 | // getStatsReader returns the correct stat reader for the given PF 45 | // Currently only drivers that implement netlink or the sriov sysfs interface are supported 46 | func getStatsReader(pf string, priority []string) (sriovStatReader, error) { 47 | // Try to find a collector that can actually read stats for at least VF 0 48 | vfTestID := "0" 49 | for _, collector := range priority { 50 | switch collector { 51 | case "sysfs": 52 | if _, err := fs.Stat(netfs, filepath.Join(pf, "/device/sriov")); !os.IsNotExist(err) { 53 | reader := sysfsReader{filepath.Join(*sysClassNet, "%s/device/sriov/%s/stats/")} 54 | // Test if sysfsReader can read stats for VF 0 55 | if 
readerHasStats(reader, pf, vfTestID) { 56 | log.Printf("%s - using sysfs collector", pf) 57 | return reader, nil 58 | } else { 59 | log.Printf("%s - sysfs collector present but no stats found for vf%s", pf, vfTestID) 60 | } 61 | } else { 62 | log.Printf("%s does not support sysfs collector, directory '%s' does not exist", pf, filepath.Join(pf, "/device/sriov")) 63 | } 64 | case "netlink": 65 | if vfstats.DoesPfSupportNetlink(pf) { 66 | reader := netlinkReader{vfstats.VfStats(pf)} 67 | // Test if netlinkReader can read stats for VF 0 68 | if readerHasStats(reader, pf, vfTestID) { 69 | log.Printf("%s - using netlink collector", pf) 70 | return reader, nil 71 | } else { 72 | log.Printf("%s - netlink collector present but no stats found for vf%s", pf, vfTestID) 73 | } 74 | } else { 75 | log.Printf("%s does not support netlink collector", pf) 76 | } 77 | default: 78 | log.Printf("%s - '%s' collector not supported", pf, collector) 79 | } 80 | } 81 | return nil, fmt.Errorf("no stats reader found for %s", pf) 82 | } 83 | 84 | // ReadStats takes in the name of a PF and the VF Id and returns a stats object. 
85 | func (r netlinkReader) ReadStats(pfName string, vfID string) sriovStats { 86 | id, err := strconv.Atoi(vfID) 87 | if err != nil { 88 | log.Print("error reading passed virtual function id") 89 | return sriovStats{} 90 | } 91 | 92 | return func() sriovStats { 93 | vf := r.data.Vfs[id] 94 | return map[string]int64{ 95 | "tx_bytes": int64(vf.TxBytes), 96 | "rx_bytes": int64(vf.RxBytes), 97 | "tx_packets": int64(vf.TxPackets), 98 | "rx_packets": int64(vf.RxPackets), 99 | "tx_dropped": int64(vf.TxDropped), 100 | "rx_dropped": int64(vf.RxDropped), 101 | "rx_broadcast": int64(vf.Broadcast), 102 | "rx_multicast": int64(vf.Multicast), 103 | } 104 | }() 105 | } 106 | 107 | func (r sysfsReader) ReadStats(pfName string, vfID string) sriovStats { 108 | stats := make(sriovStats, 0) 109 | 110 | statDir := fmt.Sprintf(sriovVFStatsDir, pfName, vfID) 111 | files, err := fs.ReadDir(netfs, statDir) 112 | if err != nil { 113 | log.Printf("error reading stats for %s vf%s\n%v", pfName, vfID, err) 114 | return stats 115 | } 116 | 117 | log.Printf("getting stats for %s vf%s", pfName, vfID) 118 | 119 | for _, f := range files { 120 | path := filepath.Join(statDir, f.Name()) 121 | if utils.IsSymLink(netfs, path) { 122 | log.Printf("could not stat file '%s'", path) 123 | continue 124 | } 125 | 126 | statRaw, err := fs.ReadFile(netfs, path) 127 | if err != nil { 128 | log.Printf("error reading file, %v", err) 129 | continue 130 | } 131 | 132 | statString := strings.TrimSpace(string(statRaw)) 133 | value, err := strconv.ParseInt(statString, 10, 64) 134 | if err != nil { 135 | log.Printf("%s - error parsing integer from value '%s'\n%v", f.Name(), statString, err) 136 | continue 137 | } 138 | 139 | stats[f.Name()] = value 140 | } 141 | 142 | return stats 143 | } 144 | -------------------------------------------------------------------------------- /collectors/sriovdev_readers_test.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 
| import ( 4 | "io/fs" 5 | "testing/fstest" 6 | 7 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/vfstats" 8 | "github.com/vishvananda/netlink" 9 | 10 | . "github.com/onsi/ginkgo/v2" 11 | . "github.com/onsi/gomega" 12 | ) 13 | 14 | var _ = DescribeTable("test getting stats reader for pf", // getStatsReader 15 | func(pf string, priority []string, fsys fs.FS, link netlink.Link, expected sriovStatReader, logs ...string) { 16 | netfs = fsys 17 | 18 | if link != nil { 19 | vfstats.GetLink = func(name string) (netlink.Link, error) { 20 | return link, nil 21 | } 22 | DeferCleanup(func() { 23 | vfstats.GetLink = netlink.LinkByName 24 | }) 25 | } 26 | 27 | statsReader, err := getStatsReader(pf, priority) 28 | 29 | if expected != nil { 30 | Expect(statsReader).To(Equal(expected)) 31 | Expect(err).To(BeNil()) 32 | } else { 33 | Expect(statsReader).To(BeNil()) 34 | Expect(err).To(HaveOccurred()) 35 | } 36 | 37 | assertLogs(logs) 38 | }, 39 | Entry("with sysfs support", 40 | "ens785f0", 41 | []string{"sysfs", "netlink"}, 42 | fstest.MapFS{ 43 | "ens785f0/device/sriov": {Mode: fs.ModeDir}, 44 | "ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("1")}, // Added to enable sysfsReader 45 | }, 46 | nil, 47 | sysfsReader{"/sys/class/net/%s/device/sriov/%s/stats"}, 48 | "ens785f0 - using sysfs collector"), 49 | Entry("without sysfs support", 50 | "ens785f0", 51 | []string{"sysfs", "netlink"}, 52 | fstest.MapFS{}, 53 | &netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{}}}, //nolint:govet 54 | netlinkReader{vfstats.VfStats("ens785f0")}, 55 | "ens785f0 does not support sysfs collector", 56 | "ens785f0 - using netlink collector"), 57 | Entry("without any collector support", 58 | "ens785f0", 59 | []string{"unsupported_collector"}, 60 | fstest.MapFS{}, 61 | nil, 62 | nil, 63 | "ens785f0 - 'unsupported_collector' collector not supported"), 64 | Entry("sysfs present but returns no stats, fallback to netlink", 65 | "ens785f0", 66 | 
[]string{"sysfs", "netlink"}, 67 | fstest.MapFS{ 68 | "ens785f0/device/sriov": {Mode: fs.ModeDir}, 69 | // sysfs stats file exists but is empty (simulates no stats) 70 | "ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("")}, 71 | }, 72 | &netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{{ID: 0, TxPackets: 42}}}}, 73 | netlinkReader{vfstats.PerPF{Pf: "ens785f0", Vfs: map[int]netlink.VfInfo{0: {ID: 0, TxPackets: 42}}}}, 74 | "ens785f0 - sysfs collector present but no stats found for vf0", 75 | "ens785f0 - using netlink collector"), 76 | ) 77 | 78 | var _ = DescribeTable("test getting reading stats through sriov sysfs interface", // sysfsReader.ReadStats 79 | func(pf string, vfId string, fsys fs.FS, expected sriovStats, logs ...string) { 80 | netfs = fsys 81 | 82 | statsReader := new(sysfsReader) 83 | stats := statsReader.ReadStats(pf, vfId) 84 | Expect(stats).To(Equal(expected)) 85 | 86 | assertLogs(logs) 87 | }, 88 | Entry("with stats files", 89 | "ens785f0", 90 | "0", 91 | fstest.MapFS{ 92 | "ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("6")}, 93 | "ens785f0/device/sriov/0/stats/rx_bytes": {Data: []byte("24")}, 94 | "ens785f0/device/sriov/0/stats/tx_packets": {Data: []byte("12")}, 95 | "ens785f0/device/sriov/0/stats/tx_bytes": {Data: []byte("48")}}, 96 | map[string]int64{ 97 | "rx_packets": 6, 98 | "rx_bytes": 24, 99 | "tx_packets": 12, 100 | "tx_bytes": 48}, 101 | "getting stats for ens785f0 vf0"), 102 | Entry("without stats files", 103 | "ens785f0", 104 | "0", 105 | fstest.MapFS{}, 106 | map[string]int64{}, 107 | "error reading stats for ens785f0 vf0", 108 | "open ens785f0/device/sriov/0/stats: file does not exist"), 109 | Entry("with stat file as a symlink", 110 | "ens785f0", 111 | "0", 112 | fstest.MapFS{ 113 | "ens785f0/device/sriov/0/stats/rx_packets": {Mode: fs.ModeSymlink}}, 114 | map[string]int64{}, 115 | "getting stats for ens785f0 vf0", 116 | "could not stat file 'ens785f0/device/sriov/0/stats/rx_packets'"), 117 | 
Entry("with stat file as a directory", 118 | "ens785f0", 119 | "0", 120 | fstest.MapFS{ 121 | "ens785f0/device/sriov/0/stats/rx_packets": {Mode: fs.ModeDir}}, 122 | map[string]int64{}, 123 | "getting stats for ens785f0 vf0", 124 | "error reading file, read ens785f0/device/sriov/0/stats/rx_packets: invalid argument"), 125 | Entry("with invalid stat file", 126 | "ens785f0", 127 | "0", 128 | fstest.MapFS{ 129 | "ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("NaN")}}, 130 | map[string]int64{}, 131 | "getting stats for ens785f0 vf0", 132 | "rx_packets - error parsing integer from value 'NaN'", 133 | "strconv.ParseInt: parsing \"NaN\": invalid syntax"), 134 | ) 135 | -------------------------------------------------------------------------------- /collectors/sriovdev_test.go: -------------------------------------------------------------------------------- 1 | package collectors 2 | 3 | import ( 4 | "fmt" 5 | "io/fs" 6 | "testing/fstest" 7 | 8 | "github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/pkg/vfstats" 9 | 10 | . "github.com/onsi/ginkgo/v2" 11 | . 
"github.com/onsi/gomega" 12 | "github.com/prometheus/client_golang/prometheus" 13 | dto "github.com/prometheus/client_model/go" 14 | "github.com/vishvananda/netlink" 15 | ) 16 | 17 | var _ = AfterEach(func() { 18 | vfstats.GetLink = netlink.LinkByName 19 | }) 20 | 21 | var _ = DescribeTable("test vf stats collection", // Collect 22 | func(priority []string, fsys fs.FS, link netlink.Device, expected []metric, logs ...string) { 23 | devfs = fsys 24 | netfs = fsys 25 | collectorPriority = priority 26 | 27 | vfstats.GetLink = func(name string) (netlink.Link, error) { 28 | return &link, nil 29 | } 30 | 31 | ch := make(chan prometheus.Metric, 1) 32 | go createSriovDevCollector().Collect(ch) 33 | 34 | for i := 0; i < len(expected); i++ { 35 | m := dto.Metric{} 36 | err := (<-ch).Write(&m) 37 | Expect(err).ToNot(HaveOccurred()) 38 | 39 | labels := make(map[string]string, 4) 40 | for _, label := range m.Label { 41 | labels[*label.Name] = *label.Value 42 | } 43 | 44 | metric := metric{labels: labels, counter: *m.Counter.Value} 45 | 46 | Expect(metric).To(BeElementOf(expected)) 47 | } 48 | 49 | assertLogs(logs) 50 | }, 51 | Entry("with only sysfs", 52 | []string{"sysfs"}, 53 | fstest.MapFS{ 54 | "0000:1d:00.0/sriov_totalvfs": {Data: []byte("64")}, 55 | "0000:1d:00.0/net/t_ens785f0": {Mode: fs.ModeDir}, 56 | "0000:1d:00.0/numa_node": {Data: []byte("0")}, 57 | "0000:1d:00.0/class": {Data: []byte("0x020000")}, 58 | "0000:1d:00.0/virtfn0": {Data: []byte("/sys/devices/0000:1d:01.0"), Mode: fs.ModeSymlink}, 59 | "0000:1d:00.0/virtfn1": {Data: []byte("/sys/devices/0000:1d:01.1"), Mode: fs.ModeSymlink}, 60 | "t_ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("4")}, 61 | "t_ens785f0/device/sriov/0/stats/tx_packets": {Data: []byte("8")}, 62 | "t_ens785f0/device/sriov/1/stats/rx_packets": {Data: []byte("16")}, 63 | "t_ens785f0/device/sriov/1/stats/tx_packets": {Data: []byte("32")}}, 64 | nil, 65 | []metric{ 66 | {map[string]string{"numa_node": "0", "pciAddr": "0000:1d:01.0", 
"pf": "t_ens785f0", "vf": "0"}, 4}, 67 | {map[string]string{"numa_node": "0", "pciAddr": "0000:1d:01.0", "pf": "t_ens785f0", "vf": "0"}, 8}, 68 | {map[string]string{"numa_node": "0", "pciAddr": "0000:1d:01.1", "pf": "t_ens785f0", "vf": "1"}, 16}, 69 | {map[string]string{"numa_node": "0", "pciAddr": "0000:1d:01.1", "pf": "t_ens785f0", "vf": "1"}, 32}}, 70 | "collecting sr-iov device metrics", 71 | "collector priority: \\[sysfs\\]", 72 | "t_ens785f0 - using sysfs collector", 73 | "getting stats for t_ens785f0 vf\\d", 74 | "getting stats for t_ens785f0 vf\\d"), 75 | Entry("with only netlink", 76 | []string{"netlink"}, 77 | fstest.MapFS{ 78 | "0000:2e:00.0/sriov_totalvfs": {Data: []byte("64")}, 79 | "0000:2e:00.0/net/t_ens801f0": {Mode: fs.ModeDir}, 80 | "0000:2e:00.0/numa_node": {Data: []byte("0")}, 81 | "0000:2e:00.0/class": {Data: []byte("0x020000")}, 82 | "0000:2e:00.0/virtfn0": {Data: []byte("/sys/devices/0000:2e:01.0"), Mode: fs.ModeSymlink}, 83 | "0000:2e:00.0/virtfn1": {Data: []byte("/sys/devices/0000:2e:01.1"), Mode: fs.ModeSymlink}}, 84 | netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{ 85 | {ID: 0, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 11, TxPackets: 12, RxBytes: 13, TxBytes: 14, Multicast: 15, Broadcast: 16, RxDropped: 17, TxDropped: 18, RssQuery: 0, Trust: 0}, 86 | {ID: 1, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 21, TxPackets: 22, RxBytes: 23, TxBytes: 24, Multicast: 25, Broadcast: 26, RxDropped: 27, TxDropped: 28, RssQuery: 0, Trust: 0}, 87 | }}}, 88 | []metric{ 89 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 11}, 90 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 12}, 91 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 13}, 92 | 
{map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 14}, 93 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 15}, 94 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 16}, 95 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 17}, 96 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.0", "pf": "t_ens801f0", "vf": "0"}, 18}, 97 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 21}, 98 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 22}, 99 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 23}, 100 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 24}, 101 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 25}, 102 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 26}, 103 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 27}, 104 | {map[string]string{"numa_node": "0", "pciAddr": "0000:2e:01.1", "pf": "t_ens801f0", "vf": "1"}, 28}}, 105 | "collecting sr-iov device metrics", 106 | "collector priority: \\[netlink\\]", 107 | "t_ens801f0 - using netlink collector"), 108 | Entry("with both sysfs and netlink", 109 | []string{"sysfs", "netlink"}, 110 | fstest.MapFS{ 111 | "0000:3f:00.0/sriov_totalvfs": {Data: []byte("64")}, 112 | "0000:3f:00.0/net/t_ens785f0": {Mode: fs.ModeDir}, 113 | "0000:3f:00.0/numa_node": {Data: []byte("0")}, 114 | "0000:3f:00.0/class": {Data: []byte("0x020000")}, 115 | "0000:3f:00.0/virtfn0": {Data: []byte("/sys/devices/0000:3f:01.0"), Mode: fs.ModeSymlink}, 116 | "t_ens785f0/device/sriov/0/stats/rx_packets": {Data: 
[]byte("4")}, 117 | "t_ens785f0/device/sriov/0/stats/tx_packets": {Data: []byte("8")}, 118 | "0000:4g:00.0/sriov_totalvfs": {Data: []byte("128")}, 119 | "0000:4g:00.0/net/t_ens801f0": {Mode: fs.ModeDir}, 120 | "0000:4g:00.0/numa_node": {Data: []byte("0")}, 121 | "0000:4g:00.0/class": {Data: []byte("0x020000")}, 122 | "0000:4g:00.0/virtfn0": {Data: []byte("/sys/devices/0000:4g:01.0"), Mode: fs.ModeSymlink}}, 123 | netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{ 124 | {ID: 0, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 31, TxPackets: 32, RxBytes: 33, TxBytes: 34, Multicast: 35, Broadcast: 36, RxDropped: 37, TxDropped: 38, RssQuery: 0, Trust: 0}, 125 | }}}, 126 | []metric{ 127 | {map[string]string{"numa_node": "0", "pciAddr": "0000:3f:01.0", "pf": "t_ens785f0", "vf": "0"}, 4}, 128 | {map[string]string{"numa_node": "0", "pciAddr": "0000:3f:01.0", "pf": "t_ens785f0", "vf": "0"}, 8}, 129 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 31}, 130 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 32}, 131 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 33}, 132 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 34}, 133 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 35}, 134 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 36}, 135 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 37}, 136 | {map[string]string{"numa_node": "0", "pciAddr": "0000:4g:01.0", "pf": "t_ens801f0", "vf": "0"}, 38}}, 137 | "collecting sr-iov device metrics", 138 | "collector priority: \\[sysfs netlink\\]", 139 | "t_ens785f0 - using sysfs collector", 140 | "getting stats for 
t_ens785f0 vf\\d"), 141 | 142 | // These logs are expected, but were causing instability in this test case, removed for now 143 | // "t_ens801f0 does not support sysfs collector, directory 't_ens801f0/device/sriov' does not exist", 144 | // "t_ens801f0 - using netlink collector", 145 | ) 146 | 147 | var _ = DescribeTable("test creating sriovDev collector", // createSriovDevCollector 148 | func(fsys fs.FS, expected sriovDevCollector, logs ...string) { 149 | devfs = fsys 150 | 151 | collector := createSriovDevCollector() 152 | Expect(collector).To(Equal(expected)) 153 | 154 | assertLogs(logs) 155 | }, 156 | Entry("only sriov net devices", 157 | fstest.MapFS{ 158 | "0000:1a:00.0/sriov_totalvfs": {Data: []byte("64")}, 159 | "0000:1a:00.0/numa_node": {Data: []byte("1")}, 160 | "0000:1a:00.0/class": {Data: []byte("0x020000")}, 161 | "0000:1a:00.1/sriov_totalvfs": {Data: []byte("64")}, 162 | "0000:1a:00.1/numa_node": {Data: []byte("1")}, 163 | "0000:1a:00.1/class": {Data: []byte("0x020000")}, 164 | "0000:2b:00.0/sriov_totalvfs": {Data: []byte("128")}, 165 | "0000:2b:00.0/numa_node": {Data: []byte("2")}, 166 | "0000:2b:00.0/class": {Data: []byte("0x020000")}, 167 | "0000:2b:00.1/sriov_totalvfs": {Data: []byte("128")}, 168 | "0000:2b:00.1/numa_node": {Data: []byte("2")}, 169 | "0000:2b:00.1/class": {Data: []byte("0x020000")}}, 170 | sriovDevCollector{ 171 | "vfstats", 172 | map[string]string{"0000:1a:00.0": "1", "0000:1a:00.1": "1", "0000:2b:00.0": "2", "0000:2b:00.1": "2"}}), 173 | Entry("mixed devices", 174 | fstest.MapFS{ 175 | "0000:3c:00.0/sriov_totalvfs": {Data: []byte("63")}, 176 | "0000:3c:00.0/numa_node": {Data: []byte("1")}, 177 | "0000:3c:00.0/class": {Data: []byte("0x020000")}, 178 | "0000:3c:00.1/sriov_totalvfs": {Data: []byte("63")}, 179 | "0000:3c:00.1/numa_node": {Data: []byte("1")}, 180 | "0000:3c:00.1/class": {Data: []byte("0x020000")}, 181 | "0000:4d:00.0/sriov_totalvfs": {Data: []byte("64")}, 182 | "0000:4d:00.0/numa_node": {Data: []byte("-1")}, 183 | 
"0000:4d:00.0/class": {Data: []byte("0x020000")}, 184 | "0000:4d:00.1/sriov_totalvfs": {Data: []byte("64")}, 185 | "0000:4d:00.1/numa_node": {Data: []byte("-1")}, 186 | "0000:4d:00.1/class": {Data: []byte("0x020000")}}, 187 | sriovDevCollector{ 188 | "vfstats", 189 | map[string]string{"0000:3c:00.0": "1", "0000:3c:00.1": "1", "0000:4d:00.0": "", "0000:4d:00.1": ""}}, 190 | "no numa node information for device '0000:4d:00.0'", 191 | "no numa node information for device '0000:4d:00.1'"), 192 | Entry("no sriov net devices", 193 | fstest.MapFS{ 194 | "0000:5e:00.0/": {Mode: fs.ModeDir}, 195 | "0000:5e:00.1/": {Mode: fs.ModeDir}, 196 | "0000:5e:00.2/": {Mode: fs.ModeDir}, 197 | "0000:5e:00.3/": {Mode: fs.ModeDir}}, 198 | sriovDevCollector{ 199 | "vfstats", 200 | map[string]string{}}, 201 | "no sriov net devices found"), 202 | ) 203 | 204 | var _ = DescribeTable("test getting sriov devices from filesystem", // getSriovDevAddrs 205 | func(fsys fs.FS, expected []string, logs ...string) { 206 | devfs = fsys 207 | 208 | devs := getSriovDevAddrs() 209 | Expect(devs).To(Equal(expected)) 210 | 211 | assertLogs(logs) 212 | }, 213 | Entry("only sriov net devices", 214 | fstest.MapFS{ 215 | "0000:6f:00.0/sriov_totalvfs": {Data: []byte("64")}, "0000:6f:00.0/class": {Data: []byte("0x020000")}, 216 | "0000:6f:00.1/sriov_totalvfs": {Data: []byte("64")}, "0000:6f:00.1/class": {Data: []byte("0x020000")}, 217 | "0000:7g:00.0/sriov_totalvfs": {Data: []byte("128")}, "0000:7g:00.0/class": {Data: []byte("0x020000")}, 218 | "0000:7g:00.1/sriov_totalvfs": {Data: []byte("128")}, "0000:7g:00.1/class": {Data: []byte("0x020000")}}, 219 | []string{"0000:6f:00.0", "0000:6f:00.1", "0000:7g:00.0", "0000:7g:00.1"}), 220 | Entry("mixed devices", 221 | fstest.MapFS{ 222 | "0000:8h:00.0/": {Mode: fs.ModeDir}, 223 | "0000:8h:00.1/": {Mode: fs.ModeDir}, 224 | "0000:9i:00.0/sriov_totalvfs": {Data: []byte("63")}, "0000:9i:00.0/class": {Data: []byte("0x020000")}, 225 | "0000:9i:00.1/sriov_totalvfs": {Data: 
[]byte("63")}, "0000:9i:00.1/class": {Data: []byte("0x020000")}}, 226 | []string{"0000:9i:00.0", "0000:9i:00.1"}), 227 | Entry("no sriov net devices", 228 | fstest.MapFS{ 229 | "0000:1b:00.0/": {Mode: fs.ModeDir}, 230 | "0000:1b:00.1/": {Mode: fs.ModeDir}, 231 | "0000:1b:00.2/": {Mode: fs.ModeDir}, 232 | "0000:1b:00.3/": {Mode: fs.ModeDir}}, 233 | []string{}, 234 | "no sriov net devices found"), 235 | ) 236 | 237 | var _ = DescribeTable("test getting sriov dev details", // getSriovDev 238 | func(dev string, priority []string, fsys fs.FS, link netlink.Link, expected sriovDev, logs ...string) { 239 | devfs = fsys 240 | netfs = fsys 241 | 242 | if link != nil { 243 | vfstats.GetLink = func(name string) (netlink.Link, error) { 244 | return link, nil 245 | } 246 | DeferCleanup(func() { 247 | vfstats.GetLink = netlink.LinkByName 248 | }) 249 | } 250 | 251 | sriovDev := getSriovDev(dev, priority) 252 | Expect(sriovDev).To(Equal(expected)) 253 | 254 | assertLogs(logs) 255 | }, 256 | Entry("with sysfs support", 257 | "0000:4f:00.0", 258 | []string{"sysfs", "netlink"}, 259 | fstest.MapFS{ 260 | "0000:4f:00.0/net/ens785f0": {Mode: fs.ModeDir}, 261 | "0000:4f:00.0/virtfn0": {Data: []byte("/sys/devices/0000:4f:01.0"), Mode: fs.ModeSymlink}, 262 | "0000:4f:00.0/virtfn1": {Data: []byte("/sys/devices/0000:4f:01.1"), Mode: fs.ModeSymlink}, 263 | "ens785f0/device/sriov": {Mode: fs.ModeDir}, 264 | "ens785f0/device/sriov/0/stats/rx_packets": {Data: []byte("1")}, // Added to enable sysfsReader 265 | "0000:5g:00.0/net/ens801f0": {Mode: fs.ModeDir}, 266 | "0000:5g:00.0/virtfn0": {Data: []byte("/sys/devices/0000:5g:01.0"), Mode: fs.ModeSymlink}}, 267 | nil, 268 | sriovDev{ 269 | "ens785f0", 270 | sysfsReader{"/sys/class/net/%s/device/sriov/%s/stats"}, 271 | map[string]string{"0": "0000:4f:01.0", "1": "0000:4f:01.1"}}, 272 | "ens785f0 - using sysfs collector"), 273 | Entry("without sysfs support", 274 | "0000:6h:00.0", 275 | []string{"sysfs", "netlink"}, 276 | fstest.MapFS{ 277 | 
"0000:6h:00.0/net/ens785f0": {Mode: fs.ModeDir}, 278 | "0000:6h:00.0/virtfn0": {Data: []byte("/sys/devices/0000:6h:01.0"), Mode: fs.ModeSymlink}, 279 | "0000:6h:00.0/virtfn1": {Data: []byte("/sys/devices/0000:6h:01.1"), Mode: fs.ModeSymlink}, 280 | "0000:7i:00.0/net/ens801f0": {Mode: fs.ModeDir}, 281 | "0000:7i:00.0/virtfn0": {Data: []byte("/sys/devices/0000:7i:01.0"), Mode: fs.ModeSymlink}}, 282 | &netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{}}}, //nolint:govet 283 | sriovDev{ 284 | "ens785f0", 285 | netlinkReader{vfstats.VfStats("ens785f0")}, 286 | map[string]string{"0": "0000:6h:01.0", "1": "0000:6h:01.1"}}, 287 | "ens785f0 does not support sysfs collector", 288 | "ens785f0 - using netlink collector"), 289 | Entry("without any collector support", 290 | "0000:8j:00.0", 291 | []string{"unsupported_collector"}, 292 | fstest.MapFS{ 293 | "0000:8j:00.0/net/ens785f0": {Mode: fs.ModeDir}, 294 | "0000:8j:00.0/virtfn0": {Data: []byte("/sys/devices/0000:8j:01.0"), Mode: fs.ModeSymlink}, 295 | "0000:8j:00.0/virtfn1": {Data: []byte("/sys/devices/0000:8j:01.1"), Mode: fs.ModeSymlink}}, 296 | nil, 297 | sriovDev{ 298 | "ens785f0", 299 | nil, 300 | map[string]string{"0": "0000:8j:01.0", "1": "0000:8j:01.1"}}, 301 | "ens785f0 - 'unsupported_collector' collector not supported"), 302 | Entry("without any virtual functions", 303 | "0000:9k:00.0", 304 | []string{"sysfs"}, 305 | fstest.MapFS{ 306 | "0000:9k:00.0/net/ens785f0": {Mode: fs.ModeDir}}, 307 | nil, 308 | sriovDev{ 309 | "ens785f0", 310 | nil, 311 | map[string]string{}}, 312 | "error getting vf address", 313 | "no virtual functions found for pf '0000:9k:00.0'", 314 | "ens785f0 does not support sysfs collector"), 315 | ) 316 | 317 | var _ = DescribeTable("test getting numa node information for devices from filesystem", // getNumaNodes // TODO: ensure map order 318 | func(devices []string, fsys fs.FS, expected map[string]string, logs ...string) { 319 | devfs = fsys 320 | 321 | numaNodes := 
getNumaNodes(devices) 322 | Expect(numaNodes).To(Equal(expected)) 323 | 324 | assertLogs(logs) 325 | }, 326 | Entry("only sriov net devices", 327 | []string{"0000:2c:00.0", "0000:2c:00.1", "0000:3d:00.0", "0000:3d:00.1"}, 328 | fstest.MapFS{ 329 | "0000:2c:00.0/numa_node": {Data: []byte("0")}, 330 | "0000:2c:00.1/numa_node": {Data: []byte("0")}, 331 | "0000:3d:00.0/numa_node": {Data: []byte("1")}, 332 | "0000:3d:00.1/numa_node": {Data: []byte("1")}}, 333 | map[string]string{"0000:2c:00.0": "0", "0000:2c:00.1": "0", "0000:3d:00.0": "1", "0000:3d:00.1": "1"}), 334 | Entry("mixed devices", 335 | []string{"0000:4e:00.0", "0000:4e:00.1", "0000:5f:00.0", "0000:5f:00.1"}, 336 | fstest.MapFS{ 337 | "0000:4e:00.0/": {Mode: fs.ModeDir}, 338 | "0000:4e:00.1/": {Mode: fs.ModeDir}, 339 | "0000:5f:00.0/numa_node": {Data: []byte("-1")}, 340 | "0000:5f:00.1/numa_node": {Data: []byte("-1")}}, 341 | map[string]string{"0000:4e:00.0": "", "0000:4e:00.1": "", "0000:5f:00.0": "", "0000:5f:00.1": ""}, 342 | "could not read numa_node file for device '0000:4e:00.0'", 343 | "open 0000:4e:00.0/numa_node: file does not exist", 344 | "could not read numa_node file for device '0000:4e:00.1'", 345 | "open 0000:4e:00.1/numa_node: file does not exist", 346 | "no numa node information for device '0000:5f:00.0'", 347 | "no numa node information for device '0000:5f:00.1'"), 348 | Entry("no sriov net devices", 349 | []string{"0000:6g:00.0", "0000:6g:00.1", "0000:6g:00.2", "0000:6g:00.3"}, 350 | fstest.MapFS{ 351 | "0000:6g:00.0/": {Mode: fs.ModeDir}, 352 | "0000:6g:00.1/": {Mode: fs.ModeDir}, 353 | "0000:6g:00.2/": {Mode: fs.ModeDir}, 354 | "0000:6g:00.3/": {Mode: fs.ModeDir}}, 355 | map[string]string{"0000:6g:00.0": "", "0000:6g:00.1": "", "0000:6g:00.2": "", "0000:6g:00.3": ""}, 356 | "could not read numa_node file for device '0000:6g:00.0'", 357 | "open 0000:6g:00.0/numa_node: file does not exist", 358 | "could not read numa_node file for device '0000:6g:00.1'", 359 | "open 0000:6g:00.1/numa_node: 
file does not exist", 360 | "could not read numa_node file for device '0000:6g:00.2'", 361 | "open 0000:6g:00.2/numa_node: file does not exist", 362 | "could not read numa_node file for device '0000:6g:00.3'", 363 | "open 0000:6g:00.3/numa_node: file does not exist"), 364 | ) 365 | 366 | var _ = DescribeTable("test getting vf information for devices from filesystem", // vfList: checks the returned vf map and, for every entry, the returned error 367 | func(dev string, fsys fs.FS, expected vfsPCIAddr, err error, logs ...string) { 368 | devfs = fsys 369 | 370 | vfs, e := vfList(dev) 371 | Expect(vfs).To(Equal(expected)) 372 | if err != nil { 373 | Expect(e).Should(MatchError(err)) 374 | } else { // entry expects success, so an unexpected error must fail the test 375 | Expect(e).ToNot(HaveOccurred()) 376 | } 377 | assertLogs(logs) 378 | }, 379 | Entry("only retrieve vf information for specified sriov net device", 380 | "0000:7h:00.0", 381 | fstest.MapFS{ 382 | "0000:7h:00.0/virtfn0": {Data: []byte("/sys/devices/0000:7h:01.0"), Mode: fs.ModeSymlink}, 383 | "0000:7h:00.0/virtfn1": {Data: []byte("/sys/devices/0000:7h:01.1"), Mode: fs.ModeSymlink}, 384 | "0000:8i:00.0/virtfn0": {Data: []byte("/sys/devices/0000:8i:01.0"), Mode: fs.ModeSymlink}}, 385 | map[string]string{"0": "0000:7h:01.0", "1": "0000:7h:01.1"}, 386 | nil), 387 | Entry("vf file is not a symlink for specified sriov net device", 388 | "0000:9j:00.0", 389 | fstest.MapFS{ 390 | "0000:9j:00.0/virtfn0": {Data: []byte("/sys/devices/0000:9j:01.0"), Mode: fs.ModeDir}}, 391 | map[string]string{}, 392 | fmt.Errorf("no virtual functions found for pf '0000:9j:00.0'"), 393 | "error evaluating symlink '0000:9j:00.0/virtfn0'"), 394 | Entry("vf file does not exist for specified sriov net device", 395 | "0000:1c:00.0", 396 | fstest.MapFS{}, 397 | map[string]string{}, 398 | fmt.Errorf("no virtual functions found for pf '0000:1c:00.0'")), 399 | ) 400 | 401 | var _ = DescribeTable("test getting vf data from filesystem", // vfData 402 | func(vfDir string, fsys fs.FS, expectedVfId string, expectedVfPciAddr string, logs ...string) { 403 | devfs = fsys 404 | 405 | vfId, vfPci := vfData(vfDir) 406 |
Expect(vfId).To(Equal(expectedVfId)) 407 | Expect(vfPci).To(Equal(expectedVfPciAddr)) 408 | 409 | assertLogs(logs) 410 | }, 411 | Entry("valid symlink", 412 | "0000:7h:00.0/virtfn0", 413 | fstest.MapFS{"0000:7h:00.0/virtfn0": {Data: []byte("/sys/devices/0000:7h:01.0"), Mode: fs.ModeSymlink}}, 414 | "0", 415 | "0000:7h:01.0"), 416 | Entry("invalid symlink", 417 | "0000:8i:00.0/virtfn0", 418 | fstest.MapFS{"0000:8i:00.0/virtfn0": {Mode: fs.ModeDir}}, 419 | "", 420 | "", 421 | "error evaluating symlink '0000:8i:00.0/virtfn0'"), 422 | ) 423 | 424 | var _ = DescribeTable("test getting pf name from pci address on filesystem", // getPFName 425 | func(dev string, fsys fs.FS, expected string, logs ...string) { 426 | devfs = fsys 427 | 428 | pfName := getPFName(dev) 429 | Expect(pfName).To(Equal(expected)) 430 | 431 | assertLogs(logs) 432 | }, 433 | Entry("pf exists", 434 | "0000:2d:00.0", 435 | fstest.MapFS{"0000:2d:00.0/net/ens785f0": {Mode: fs.ModeDir}}, 436 | "ens785f0"), 437 | Entry("pf does not exist", 438 | "0000:3e:00.0", 439 | fstest.MapFS{}, 440 | "", 441 | "0000:3e:00.0 - could not get pf interface name in path '0000:3e:00.0/net'", 442 | "open 0000:3e:00.0/net: file does not exist"), 443 | ) 444 | -------------------------------------------------------------------------------- /deployment/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: sriov-metrics-exporter 6 | app.kubernetes.io/version: v0.0.1 7 | name: sriov-metrics-exporter 8 | namespace: monitoring 9 | spec: 10 | revisionHistoryLimit: 10 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/name: sriov-metrics-exporter 14 | template: 15 | metadata: 16 | labels: 17 | app.kubernetes.io/name: sriov-metrics-exporter 18 | app.kubernetes.io/version: v0.0.1 19 | spec: 20 | hostNetwork: true 21 | containers: 22 | - args: 23 | - --path.kubecgroup=/host/kubecgroup 24 | - 
--path.sysbuspci=/host/sys/bus/pci/devices/ 25 | - --path.sysclassnet=/host/sys/class/net/ 26 | - --path.cpucheckpoint=/host/cpu_manager_state 27 | - --path.kubeletsocket=/host/kubelet.sock 28 | - --collector.kubepoddevice=true 29 | - --collector.vfstatspriority=sysfs,netlink 30 | image: ghcr.io/k8snetworkplumbingwg/sriov-network-metrics-exporter:latest 31 | imagePullPolicy: Always 32 | name: sriov-metrics-exporter 33 | resources: 34 | requests: 35 | memory: 100Mi 36 | cpu: 100m 37 | limits: 38 | memory: 100Mi 39 | cpu: 100m 40 | securityContext: 41 | capabilities: 42 | drop: 43 | - ALL 44 | readOnlyRootFilesystem: true 45 | allowPrivilegeEscalation: false 46 | volumeMounts: 47 | - mountPath: /host/kubelet.sock 48 | name: kubeletsocket 49 | - mountPath: /host/sys/bus/pci/devices 50 | name: sysbuspcidevices 51 | readOnly: true 52 | - mountPath: /host/sys/devices 53 | name: sysdevices 54 | readOnly: true 55 | - mountPath: /host/sys/class/net 56 | name: sysclassnet 57 | readOnly: true 58 | - mountPath: /host/kubecgroup 59 | name: kubecgroup 60 | readOnly: true 61 | - mountPath: /host/cpu_manager_state 62 | name: cpucheckpoint 63 | readOnly: true 64 | nodeSelector: 65 | kubernetes.io/os: linux 66 | feature.node.kubernetes.io/network-sriov.capable: "true" 67 | restartPolicy: Always 68 | tolerations: 69 | - operator: Exists 70 | volumes: 71 | - hostPath: 72 | path: /var/lib/kubelet/pod-resources/kubelet.sock 73 | type: "Socket" 74 | name: kubeletsocket 75 | - hostPath: 76 | path: /sys/fs/cgroup/cpuset/kubepods.slice/ 77 | type: "Directory" 78 | name: kubecgroup 79 | - hostPath: 80 | path: /var/lib/kubelet/cpu_manager_state 81 | type: "File" 82 | name: cpucheckpoint 83 | - hostPath: 84 | path: /sys/class/net 85 | type: "Directory" 86 | name: sysclassnet 87 | - hostPath: 88 | path: /sys/bus/pci/devices 89 | type: "Directory" 90 | name: sysbuspcidevices 91 | - hostPath: 92 | path: /sys/devices 93 | type: "Directory" 94 | name: sysdevices 95 | --- 96 | apiVersion: v1 97 | 
kind: Service 98 | metadata: 99 | name: sriov-metrics-exporter 100 | namespace: monitoring 101 | annotations: 102 | prometheus.io/target: "true" 103 | spec: 104 | selector: 105 | app.kubernetes.io/name: sriov-metrics-exporter 106 | ports: 107 | - protocol: TCP 108 | port: 9808 109 | targetPort: 9808 110 | -------------------------------------------------------------------------------- /deployment/minimum-prom-scrape-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | prometheus.yml: | 4 | global: 5 | evaluation_interval: 5s 6 | scrape_interval: 5s 7 | scrape_timeout: 5s 8 | scrape_configs: 9 | - job_name: 'sriov-metrics' 10 | kubernetes_sd_configs: 11 | - role: endpoints 12 | relabel_configs: 13 | - source_labels: [__meta_kubernetes_endpoint_node_name] 14 | target_label: instance 15 | - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_target] 16 | action: keep 17 | regex: true 18 | static_configs: 19 | - targets: ['sriov-metrics-exporter.monitoring.svc.cluster.local'] 20 | scheme: http 21 | - job_name: 'sriov-metrics-standalone' 22 | scheme: http 23 | kubernetes_sd_configs: 24 | - role: node 25 | relabel_configs: 26 | - source_labels: [__address__] 27 | regex: ^(.*):\d+$ 28 | target_label: __address__ 29 | replacement: $1:9999 30 | - target_label: __scheme__ 31 | replacement: http 32 | kind: ConfigMap 33 | metadata: 34 | annotations: 35 | labels: 36 | app: prometheus 37 | component: server 38 | name: minimum-scrape-config-sriov-exporter.yml 39 | namespace: monitoring 40 | -------------------------------------------------------------------------------- /docs/prometheus-queries: -------------------------------------------------------------------------------- 1 | ## CPU usage per pod 2 | 3 | The query below returns the name and namespace of every guaranteed pod together with its specific CPU usage: 4 | `sriov_kubepodcpu * on (uid) group_left(pod,namespace) kube_pod_info` 5 |
kube_pod_info here is a metric supplied by the kube-state-metrics project. 6 | 7 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter 2 | 3 | go 1.23.0 4 | 5 | require ( 6 | github.com/onsi/ginkgo/v2 v2.27.2 7 | github.com/onsi/gomega v1.38.2 8 | github.com/prometheus/client_golang v1.23.2 9 | github.com/prometheus/client_model v0.6.2 10 | github.com/vishvananda/netlink v1.3.1 11 | golang.org/x/time v0.12.0 12 | google.golang.org/grpc v1.56.3 13 | k8s.io/kubelet v0.25.16 14 | ) 15 | 16 | require ( 17 | github.com/Masterminds/semver/v3 v3.4.0 // indirect 18 | github.com/beorn7/perks v1.0.1 // indirect 19 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 20 | github.com/go-logr/logr v1.4.3 // indirect 21 | github.com/go-task/slim-sprig/v3 v3.0.0 // indirect 22 | github.com/gogo/protobuf v1.3.2 // indirect 23 | github.com/golang/protobuf v1.5.3 // indirect 24 | github.com/google/go-cmp v0.7.0 // indirect 25 | github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect 26 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 27 | github.com/prometheus/common v0.66.1 // indirect 28 | github.com/prometheus/procfs v0.16.1 // indirect 29 | github.com/vishvananda/netns v0.0.5 // indirect 30 | go.yaml.in/yaml/v2 v2.4.2 // indirect 31 | go.yaml.in/yaml/v3 v3.0.4 // indirect 32 | golang.org/x/mod v0.27.0 // indirect 33 | golang.org/x/net v0.43.0 // indirect 34 | golang.org/x/sync v0.16.0 // indirect 35 | golang.org/x/sys v0.35.0 // indirect 36 | golang.org/x/text v0.28.0 // indirect 37 | golang.org/x/tools v0.36.0 // indirect 38 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 39 | google.golang.org/protobuf v1.36.8 // indirect 40 | ) 41 | -------------------------------------------------------------------------------- 
/go.sum: -------------------------------------------------------------------------------- 1 | github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= 2 | github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= 3 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 4 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 5 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 6 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 7 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 8 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= 10 | github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= 11 | github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= 12 | github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= 13 | github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= 14 | github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= 15 | github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= 16 | github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 17 | github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= 18 | github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= 19 | github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= 20 | github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= 21 | github.com/gogo/protobuf v1.3.2 
h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= 22 | github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= 23 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 24 | github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= 25 | github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 26 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 27 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 28 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 29 | github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= 30 | github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= 31 | github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= 32 | github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= 33 | github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= 34 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 35 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 36 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 37 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 38 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 39 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 40 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 41 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 42 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 43 | 
github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= 44 | github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= 45 | github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= 46 | github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= 47 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 48 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 49 | github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= 50 | github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= 51 | github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= 52 | github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= 53 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 54 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 55 | github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= 56 | github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= 57 | github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= 58 | github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= 59 | github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= 60 | github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= 61 | github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= 62 | github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= 63 | github.com/rogpeppe/go-internal v1.13.1 
h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 64 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 65 | github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 66 | github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 67 | github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= 68 | github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 69 | github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= 70 | github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= 71 | github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= 72 | github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 73 | github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= 74 | github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= 75 | github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= 76 | github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= 77 | github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= 78 | github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= 79 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 80 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 81 | go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= 82 | go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= 83 | go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= 84 | go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= 85 | go.yaml.in/yaml/v3 v3.0.4 
h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= 86 | go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= 87 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 88 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 89 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 90 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 91 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 92 | golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= 93 | golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= 94 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 95 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 96 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 97 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 98 | golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= 99 | golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= 100 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 101 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 102 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 103 | golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= 104 | golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 105 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 106 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 107 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 108 | golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 109 | golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 110 | golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= 111 | golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 112 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 113 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 114 | golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= 115 | golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= 116 | golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= 117 | golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= 118 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 119 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 120 | golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 121 | golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 122 | golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= 123 | golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= 124 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 125 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 126 | 
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 127 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 128 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= 129 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= 130 | google.golang.org/grpc v1.56.3 h1:8I4C0Yq1EjstUzUJzpcRVbuYA2mODtEmpWiQoN/b2nc= 131 | google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= 132 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 133 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 134 | google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= 135 | google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= 136 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 137 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 138 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 139 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 140 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 141 | k8s.io/kubelet v0.25.16 h1:TksmBHJSPh7CsoMXtV378SEiiEBRSS9EVRrm/GeCE3Q= 142 | k8s.io/kubelet v0.25.16/go.mod h1:zxC3K/9ZT9EphrtoMtuXXb0xiHJKY+LRIucg0JhGopc= 143 | -------------------------------------------------------------------------------- /pkg/utils/test/target: -------------------------------------------------------------------------------- 1 | hello world 2 | -------------------------------------------------------------------------------- 
// /pkg/utils/utils.go:

// StringListFlag is a comma-separated string flag type. Its pointer type has
// the String/Set method pair expected of a command-line flag value.
type StringListFlag []string

// String returns the list elements joined with commas.
// A nil receiver yields the empty string instead of panicking.
func (list *StringListFlag) String() string {
	if list == nil {
		return ""
	}

	return strings.Join(*list, ",")
}

// Set replaces the list with the comma-separated elements of val, trimming
// leading and trailing whitespace from every element. It always returns nil.
func (list *StringListFlag) Set(val string) error {
	parts := strings.Split(val, ",")
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	*list = parts

	return nil
}

// ResolveFlag resolves *path in place via ResolvePath. On failure the
// returned error message is prefixed with the flag name.
func ResolveFlag(flag string, path *string) error {
	if err := ResolvePath(path); err != nil {
		// %v keeps the result a plain message error, as before.
		return fmt.Errorf("%s - %v", flag, err)
	}

	return nil
}

// ResolvePath rewrites *path to its absolute form with symbolic links
// evaluated.
//
// Results by case:
//   - nil or empty path: an error is returned and nothing is written.
//   - filepath.Abs fails: *path is set to "" and the error names the path
//     that was originally supplied.
//   - EvalSymlinks fails: *path is set to the absolute path and an error is
//     returned.
//   - success: *path is set to the evaluated path and nil is returned.
func ResolvePath(path *string) error {
	if path == nil || *path == "" {
		return fmt.Errorf("unable to resolve an empty path")
	}

	// Remember the caller's value so the error below can still report it
	// after *path has been cleared.
	original := *path

	cleanPath, err := filepath.Abs(original)
	if err != nil {
		*path = ""
		return fmt.Errorf("unsafe or invalid path specified '%s'\n%v", original, err)
	}

	evaluatedPath, err := EvalSymlinks(cleanPath)
	if err != nil {
		*path = cleanPath
		return fmt.Errorf("unable to evaluate symbolic links on path '%s'\n%v", cleanPath, err)
	}

	*path = evaluatedPath

	return nil
}

// EvalSymlinks wraps filepath.EvalSymlinks in a package-level variable.
// Required to enable testing (filepath.EvalSymlinks does not support the
// fs.FS interface that fstest implements).
var EvalSymlinks = func(path string) (string, error) {
	return filepath.EvalSymlinks(path)
}

// IsSymLink reports whether fs.Stat(fsys, path) succeeds and the returned
// mode has the fs.ModeSymlink bit set. Any Stat error yields false.
//
// The bit is tested with a mask, so permission or other mode bits do not
// hide the symlink flag.
// NOTE(review): whether fs.Stat follows links is up to the fs.FS
// implementation - confirm for the filesystems callers pass in.
func IsSymLink(fsys fs.FS, path string) bool {
	info, err := fs.Stat(fsys, path)
	if err != nil {
		return false
	}

	return info.Mode()&fs.ModeSymlink != 0
}

// /pkg/utils/utils_test.go:
--------------------------------------------------------------------------------
package utils

import (
	"fmt"
	"io/fs"
	"log"
	"os"
	"path/filepath"
	"testing"
	"testing/fstest"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// Relative paths used by the "symbolic link" entry of the path resolution table.
const (
	linkPath   = "test/link"   // where the symlink is created during the test
	targetPath = "test/target" // the file the symlink points at
)

// TestUtils is the go test entry point; it registers Ginkgo's Fail as the
// Gomega fail handler and runs the specs under the "utils test suite" name.
func TestUtils(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "utils test suite")
}

// Clear the standard logger's flags before the suite runs.
var _ = BeforeSuite(func() {
	log.SetFlags(0)
})

// After every spec, remove the symlink at linkPath. The error from os.Remove
// is discarded, so a missing link is not treated as a failure.
var _ = AfterEach(func() {
	os.Remove(linkPath)
})

// Table parameters: input path, expected value of the path after resolution,
// whether a symlink must be created first, and the expected error.
var _ = DescribeTable("test path resolution", // ResolvePath
	func(input string, output string, isSymlink bool, expectedErr error) {
		if isSymlink {
			// Create a symlink at input pointing to the absolute form of output.
			// Note: this local targetPath shadows the package-level constant.
			targetPath, err := filepath.Abs(output)
			Expect(err).ToNot(HaveOccurred())

			err = os.Symlink(targetPath, input)
			Expect(err).ToNot(HaveOccurred())
		}

		// ResolvePath rewrites input in place; the rewritten value is always checked.
		err := ResolvePath(&input)
		Expect(input).To(Equal(output))

		// The error is only compared when an expected error is supplied, so
		// entries that pass nil do not assert on err at all.
		if expectedErr != nil {
			Expect(err).To(Equal(expectedErr))
		}
	},
	Entry("path resolved without change", "/var/lib/kubelet/cpu_manager_state", "/var/lib/kubelet/cpu_manager_state", false, nil),
	Entry("path resolved with change", "/var/lib/../lib/kubelet/cpu_manager_state", "/var/lib/kubelet/cpu_manager_state", false, nil),
	Entry("empty path", "", "", false, fmt.Errorf("unable to resolve an empty path")),
	Entry("symbolic link", linkPath, getAbsPath(targetPath), true, nil),
)

// Table parameters: flag name, input path, expected value of the path after
// resolution, and the expected error (only compared when non-nil).
var _ = DescribeTable("test flag resolution", // ResolveFlag
	func(flag string, path string, expectedResult string, expectedErr error) {
		err := ResolveFlag(flag, &path)
		Expect(path).To(Equal(expectedResult))

		if expectedErr != nil {
			Expect(err).To(Equal(expectedErr))
		}
	},
	Entry("flag resolved", "test_flag1", "/var/lib/kubelet/cpu_manager_state", "/var/lib/kubelet/cpu_manager_state", nil),
	Entry("empty path", "test_flag2", "", "", fmt.Errorf("test_flag2 - unable to resolve an empty path")),
)

// Each entry supplies an in-memory fstest.MapFS holding a single file whose
// Mode is exactly one mode-type bit, and the expected IsSymLink result.
var _ = DescribeTable("test IsSymLink", // IsSymLink
	func(fsys fs.FS, path string, expected bool) {
		Expect(IsSymLink(fsys, path)).To(Equal(expected))
	},
	Entry("with symlink", fstest.MapFS{"test_file": {Mode: fs.ModeSymlink}}, "test_file", true),
	Entry("without symlink", fstest.MapFS{"test_file": {Mode: fs.ModeDir}}, "test_file", false),
)

// Table parameters: the raw flag string passed to Set, the expected parsed
// list, and the expected output of String.
var _ = DescribeTable("test StringListFlag type", // StringListFlag
	func(input string, expectedSlice StringListFlag, expectedString string) {
		var list StringListFlag
		err := list.Set(input)

		Expect(err).ToNot(HaveOccurred())
		Expect(list).To(Equal(expectedSlice))
		Expect(list.String()).To(Equal(expectedString))
	},
	Entry("just one value", "sysfs", StringListFlag{"sysfs"}, "sysfs"),
	Entry("two values", "sysfs,netlink", StringListFlag{"sysfs", "netlink"}, "sysfs,netlink"),
	Entry("odd formatting", " sysfs , netlink ", StringListFlag{"sysfs", "netlink"}, "sysfs,netlink"),
)

// getAbsPath returns filepath.Abs(fp). A failure is only logged; the value
// returned by filepath.Abs is passed back to the caller either way.
func getAbsPath(fp string) string {
	absPath, err := filepath.Abs(fp)
	if err != nil {
		log.Printf("Failed to get absolute path, %v", err.Error())
	}
	return absPath
}
--------------------------------------------------------------------------------
/pkg/vfstats/netlink.go:
--------------------------------------------------------------------------------
// Package vfstats contains methods to pull the SRIOV stats from various locations in Linux.
package vfstats

import (
	"log"

	"github.com/vishvananda/netlink"
)

// PerPF holds the netlink information of each virtual function of one
// physical function.
type PerPF struct {
	Pf  string                 // name of the physical function, as passed to VfStats
	Vfs map[int]netlink.VfInfo // per-VF info keyed by the VF's ID
}

// VfStats returns the stats for all of the SRIOV Virtual Functions attached to
// the given Physical Function.
//
// The link is looked up through GetLink. If the lookup fails, the error is
// logged and a PerPF with an empty (non-nil) Vfs map is returned.
func VfStats(pf string) PerPF {
	output := PerPF{pf, make(map[int]netlink.VfInfo)}
	lnk, err := GetLink(pf)
	if err != nil {
		log.Printf("netlink: error retrieving link for pf '%s'\n%v", pf, err)
		return output
	}

	// Index every VF reported in the link attributes by its ID.
	for _, vf := range lnk.Attrs().Vfs {
		output.Vfs[vf.ID] = vf
	}

	return output
}

// DoesPfSupportNetlink returns true if the Physical Function supports the
// netlink APIs, which is determined by GetLink(pf) returning no error.
func DoesPfSupportNetlink(pf string) bool {
	_, err := GetLink(pf)
	return err == nil
}

// GetLink looks up a link by name. It defaults to netlink.LinkByName and is a
// package-level variable, so it can be reassigned (the package's test suite
// replaces it with a stub).
var GetLink = netlink.LinkByName
--------------------------------------------------------------------------------
/pkg/vfstats/netlink_test.go:
--------------------------------------------------------------------------------
package vfstats

import (
	"fmt"
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"github.com/vishvananda/netlink"
)

// TestNetlink is the go test entry point; it registers Ginkgo's Fail as the
// Gomega fail handler and runs the specs under the "netlink test suite" name.
func TestNetlink(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "netlink test suite")
}

// Table parameters: device name passed to VfStats, the link and error the
// stubbed GetLink returns, and the expected VfStats result. The variadic logs
// parameter is accepted but not used by the table body.
var _ = DescribeTable("test vf stats collection", // VfStats
	func(devName string, link netlink.Device, err error, expectedPerPF PerPF, logs ...string) {
		// Replace the package-level lookup with a stub returning the entry's
		// link and error. The previous value of GetLink is not restored.
		GetLink = func(name string) (netlink.Link, error) {
			return &link, err
		}

		Expect(VfStats(devName)).To(Equal(expectedPerPF))
	},
	Entry("Without error",
		"ens801f0",
		netlink.Device{LinkAttrs: netlink.LinkAttrs{Vfs: []netlink.VfInfo{
			{ID: 0, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 11, TxPackets: 12, RxBytes: 13, TxBytes: 14, Multicast: 15, Broadcast: 16, RxDropped: 17, TxDropped: 18, RssQuery: 0, Trust: 0},
			{ID: 1, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 21, TxPackets: 22, RxBytes: 23, TxBytes: 24, Multicast: 25, Broadcast: 26, RxDropped: 27, TxDropped: 28, RssQuery: 0, Trust: 0}}}},
		nil,
		PerPF{"ens801f0", map[int]netlink.VfInfo{
			0: {ID: 0, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 11, TxPackets: 12, RxBytes: 13, TxBytes: 14, Multicast: 15, Broadcast: 16, RxDropped: 17, TxDropped: 18, RssQuery: 0, Trust: 0},
			1: {ID: 1, Mac: nil, Vlan: 0, Qos: 0, TxRate: 0, Spoofchk: true, LinkState: 0, MaxTxRate: 0, MinTxRate: 0, RxPackets: 21, TxPackets: 22, RxBytes: 23, TxBytes: 24, Multicast: 25, Broadcast: 26, RxDropped: 27, TxDropped: 28, RssQuery: 0, Trust: 0}}},
	),
	// NOTE(review): nil is passed for the netlink.Device (struct) parameter -
	// confirm the table machinery converts it to the zero value.
	Entry("With error",
		"ens801f0",
		nil,
		fmt.Errorf("Link not found"),
		PerPF{"ens801f0", map[int]netlink.VfInfo{}},
	),
)
--------------------------------------------------------------------------------