├── .code-workspace ├── .devcontainer ├── devcontainer.json └── devcontainer.json.license ├── .dockerignore ├── .github └── workflows │ ├── release.yml │ ├── reuse.yml │ └── verify.yml ├── .gitignore ├── .goreleaser.yaml ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── LICENSES └── Apache-2.0.txt ├── README.md ├── cext ├── cext.go ├── cext.swigcxx ├── cext_test.go ├── cjobfetcher.cpp ├── cjobfetcher.hpp ├── cnodefetcher.cpp ├── cnodefetcher.hpp ├── justfile ├── server.go └── test │ ├── job_test.cpp │ ├── node_test.cpp │ ├── sleep.sh │ ├── test_util.cpp │ └── test_util.hpp ├── cmain.go ├── docker_build.sh ├── entrypoint.sh ├── exporter ├── diags.go ├── diags_test.go ├── fixtures │ ├── license_out.json │ ├── license_out.json.license │ ├── sacctmgr.txt │ ├── sacctmgr.txt.license │ ├── sdiag.json │ ├── sdiag.json.license │ ├── sdiag_2405.json │ ├── sdiag_2405.json.license │ ├── sinfo_fallback.txt │ ├── sinfo_fallback.txt.license │ ├── sinfo_out.json │ ├── sinfo_out.json.license │ ├── squeue_fallback.txt │ ├── squeue_fallback.txt.license │ ├── squeue_out.json │ ├── squeue_out.json.license │ ├── trace_info_body.json │ └── trace_info_body.json.license ├── jobs.go ├── jobs_test.go ├── license.go ├── license_test.go ├── limits.go ├── limits_test.go ├── main_test.go ├── mock_utils.go ├── nodes.go ├── nodes_test.go ├── server.go ├── trace.go ├── trace_test.go ├── utils.go └── utils_test.go ├── go.mod ├── go.mod.license ├── go.sum ├── go.sum.license ├── images ├── dev_container_launch.png ├── dev_container_launch.png.license ├── trace_example.png └── trace_example.png.license ├── init_cgroup.conf ├── init_slurm.conf ├── justfile ├── main.go └── wrappers └── proctrac.py /.code-workspace: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rivosinc/prometheus-slurm-exporter/9186d0777b03fce4d36c988336639335e827eb1f/.code-workspace -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile 3 | { 4 | "name": "Existing Dockerfile", 5 | "build": { 6 | // Sets the run context to one level up instead of the .devcontainer folder. 7 | "context": "..", 8 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 9 | "dockerfile": "../Dockerfile" 10 | }, 11 | "features": { 12 | "ghcr.io/devcontainers/features/github-cli:1": {}, 13 | "ghcr.io/wxw-matt/devcontainer-features/command_runner:0": {} 14 | } 15 | 16 | // Features to add to the dev container. More info: https://containers.dev/features. 17 | // "features": {}, 18 | 19 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 20 | // "forwardPorts": [], 21 | 22 | // Uncomment the next line to run commands after the container is created. 23 | // "postCreateCommand": "cat /etc/os-release", 24 | 25 | // Configure tool-specific properties. 26 | // "customizations": {}, 27 | 28 | // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
29 | // "remoteUser": "devcontainer" 30 | } 31 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json.license: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | build/ 5 | venv/ 6 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: goreleaser 6 | 7 | on: 8 | release: 9 | types: [published] 10 | 11 | permissions: 12 | contents: write 13 | 14 | jobs: 15 | goreleaser: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - 19 | name: Checkout 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | - 24 | name: Set up Go 25 | uses: actions/setup-go@v5 26 | with: 27 | go-version: '>=1.20' 28 | - 29 | name: Run GoReleaser 30 | uses: goreleaser/goreleaser-action@v6 31 | with: 32 | # either 'goreleaser' (default) or 'goreleaser-pro' 33 | distribution: goreleaser 34 | # 'latest', 'nightly', or a semver 35 | version: '~> v2' 36 | args: release --clean 37 | env: 38 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 39 | -------------------------------------------------------------------------------- /.github/workflows/reuse.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: reuse 6 | 7 | on: [push, pull_request] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: reuse Compliance Check 15 | uses: fsfe/reuse-action@v1 16 | -------------------------------------------------------------------------------- /.github/workflows/verify.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: Verify exporter 6 | 7 | on: push 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.10' 17 | - uses: actions/setup-go@v5 18 | with: 19 | go-version: '>=1.23' 20 | - uses: pre-commit/action@v3.0.1 21 | 22 | test: 23 | runs-on: ubuntu-latest 24 | needs: pre-commit 25 | steps: 26 | - uses: actions/checkout@v4 27 | - uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.10' 30 | - uses: actions/setup-go@v5 31 | with: 32 | go-version: '>=1.23' 33 | - run: pip install psutil requests 34 | - run: CGO_ENABLED=0 go test ./exporter 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Binaries for programs and plugins 6 | *.so 7 | *.dylib 8 | 9 | # Test binary, built with `go test -c` 10 | *.test 11 | 12 | # Output of the go coverage tool, specifically when used with LiteIDE 13 | *.out 14 | 15 | 16 | # Go workspace file 17 | go.work 18 | build/ 19 | venv/ 20 | vendor 21 | .vscode/ 22 | tmp* 23 | .env 24 | coverage.html 25 | coverage.out 26 | .DS_Store 27 | dist 28 | **/__debug_bin* 29 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | version: 2 5 | builds: 6 | - env: 7 | - CGO_ENABLED=0 8 | goos: 9 | - linux 10 | goarch: 11 | - amd64 12 | - arm 13 | - arm64 14 | nfpms: 15 | - vendor: rivosinc 16 | maintainer: abhinavDhulipala 17 | formats: 18 | - apk 19 | - deb 20 | - rpm 21 | - termux.deb 22 | - archlinux 23 | archives: 24 | - name_template: '{{ .ProjectName }}_{{ .Os }}_{{ .Arch }}' 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.4.0 8 | hooks: 9 | - id: end-of-file-fixer 10 | - id: mixed-line-ending 11 | - id: trailing-whitespace 12 | - id: check-merge-conflict 13 | 14 | - repo: https://github.com/dnephin/pre-commit-golang 15 | rev: v0.5.1 16 | hooks: 17 | - id: go-fmt 18 | # - id: go-unit-tests 19 | - id: go-mod-tidy 20 | 21 | - repo: https://github.com/rivosinc/reuse-tool 22 | rev: 'b512b7b19fd56388eda5f0c7a8ea6c2ad094109f' 23 | hooks: 24 | # Check compliance 25 | - id: reuse 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | SPDX-FileCopyrightText: 2023 Rivos Inc. 3 | 4 | SPDX-License-Identifier: Apache-2.0 5 | --> 6 | # Developing the main package (Golang only) 7 | 8 | Requirements: 9 | - [golang](https://go.dev/doc/install) >= 1.20 10 | - [python3](https://www.python.org/downloads/) > 3.8 11 | - python3-pip 12 | - python3-venv 13 | - [just](https://github.com/casey/just) 14 | 15 | Once the requirements are met, run `just init && just test-exporter`. If all tests pass, you have a working install of the exporter. 16 | 17 | 18 | # Developing the C extension 19 | 20 | ## Developing w/ Docker 21 | 22 | ```bash 23 | # should take about 5-10 min 24 | just docker 25 | ``` 26 | 27 | This should be all that's required to get started. The container launches a single-node slurm cluster on startup. If these services are killed for some reason, run `./entrypoint.sh bash` within the container. The container ships with everything needed to contribute to the repo out of the box. 28 | 29 | ### Opening in VScode 30 | 31 | Download the following extensions: 32 | - [Dev Container](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) 33 | - [SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 34 | 35 | If building the image natively fails, users can build docker with `--platform linux/amd64`.
After building the container, 36 | open the VS Code command palette (`Cmd/Ctrl+Shift+P`) and run the following: 37 | 38 | ![launch dev container](images/dev_container_launch.png) 39 | 40 | This should pull our configured dev container. From there, our standard plugins should work with minimal modifications: 41 | 42 | - [Go](https://marketplace.visualstudio.com/items?itemName=golang.Go) 43 | - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) 44 | - [C/C++](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools) 45 | 46 | For the C/C++ extension, add the following include path to `.vscode/c_cpp_properties.json`: 47 | 48 | ```json 49 | { 50 | "configurations": [ 51 | { 52 | "name": "Linux", 53 | "includePath": [ 54 | "${workspaceFolder}/**", 55 | "/usr/lib64/include" 56 | ], 57 | "defines": [], 58 | "compilerPath": "/usr/bin/gcc", 59 | "cStandard": "c17", 60 | "cppStandard": "gnu++14", 61 | "intelliSenseMode": "linux-gcc-x64" 62 | } 63 | ], 64 | "version": 4 65 | } 66 | ``` 67 | 68 | ### Developing Locally 69 | 70 | Download slurm and the associated headers. This will typically involve [downloading](https://github.com/SchedMD/slurm/tags) a slurm release and 71 | configuring and installing the repo. Note that installing the RPM/apt packages won't install the headers that the extension needs. 72 | After installation, set the variables in your `.env` file and invoke the build via the `justfile`. 73 | 74 | | Variable | Default Value | Purpose | 75 | |-------------------|----------------------|-----------------------------------------------------------------------------| 76 | | SLURM_LIB_DIR | /usr/lib64/lib/slurm | Directory where `libslurm.so` is located | 77 | | SLURM_INCLUDE_DIR | /usr/lib64/include | Location of `slurm/slurm.h` | 78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | FROM --platform=linux/amd64 ubuntu:20.04 5 | SHELL ["/bin/bash", "-c"] 6 | ARG SLURM_VERSION="23-02-5-1" 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | ENV LD_LIBRARY_PATH=/usr/lib64/lib/slurm 9 | ENV PATH=/usr/lib64/bin:/usr/lib64/sbin:/root/.cargo/bin:/usr/local/go/bin:$PATH 10 | RUN apt-get update -y && apt-get install -y build-essential \ 11 | cargo \ 12 | git \ 13 | git-lfs \ 14 | gdb \ 15 | libjson-c-dev \ 16 | python3-venv \ 17 | python-is-python3 \ 18 | python3-pip \ 19 | tmux \ 20 | vim \ 21 | swig3.0 \ 22 | wget && \ 23 | apt-get autoclean && \ 24 | ln -s /usr/bin/swig3.0 /usr/bin/swig 25 | # munge 26 | RUN printf '#!/bin/sh\nexit 0' > /usr/sbin/policy-rc.d && apt-get install -y libmunge-dev munge && apt-get autoclean && chown 0 /var/log/munge/munged.log 27 | # install slurm 28 | RUN mkdir -p /etc/slurm && \ 29 | mkdir -p /usr/lib64 && \ 30 | mkdir -p /var/log/slurm && \ 31 | mkdir -p /var/spool/slurmd && \ 32 | wget "https://github.com/SchedMD/slurm/archive/refs/tags/slurm-${SLURM_VERSION}.tar.gz" && \ 33 | tar -xf "slurm-${SLURM_VERSION}.tar.gz" && \ 34 | cd "slurm-slurm-${SLURM_VERSION}" && \ 35 | ./configure --prefix=/usr/lib64 --sysconfdir=/etc/slurm/ && \ 36 | make install && \ 37 | cd ..
&& \ 38 | rm -rf "slurm-slurm-${SLURM_VERSION}" && \ 39 | rm "slurm-${SLURM_VERSION}.tar.gz" 40 | # install go deps 41 | RUN arch=`uname -m` && \ 42 | if [ $arch == "aarch64" ]; then arch="arm64"; elif [ "$arch" == "x86_64" ]; then arch="amd64" ;fi && \ 43 | wget "https://go.dev/dl/go1.23.1.linux-${arch}.tar.gz" && \ 44 | tar -C /usr/local -xzf "go1.23.1.linux-${arch}.tar.gz" && \ 45 | rm "go1.23.1.linux-${arch}.tar.gz" && \ 46 | mkdir /src 47 | 48 | # default wrapper deps for e2e tests 49 | RUN pip install -U pip requests psutil 50 | WORKDIR /src 51 | RUN cargo install just 52 | # load project and cluster configs 53 | ADD . . 54 | RUN cp init_cgroup.conf /etc/slurm/cgroup.conf && \ 55 | cp init_slurm.conf /etc/slurm/slurm.conf 56 | ENTRYPOINT [ "./entrypoint.sh" ] 57 | ARG USER=$USER 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 10 | 11 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 12 | 13 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 14 | 15 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 16 | 17 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 18 | 19 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 20 | 21 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 22 | 23 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 24 | 25 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 26 | 27 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 28 | 29 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 30 | 31 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 32 | 33 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 34 | 35 | (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and 36 | 37 | (b) You must cause any modified files to carry prominent notices stating that You changed the files; and 38 | 39 | (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 40 | 41 | (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 42 | 43 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 44 | 45 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 46 | 47 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 48 | 49 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 50 | 51 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 52 | 53 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 54 | 55 | END OF TERMS AND CONDITIONS 56 | 57 | APPENDIX: How to apply the Apache License to your work. 
58 | 59 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 60 | 61 | Copyright [yyyy] [name of copyright owner] 62 | 63 | Licensed under the Apache License, Version 2.0 (the "License"); 64 | you may not use this file except in compliance with the License. 65 | You may obtain a copy of the License at 66 | 67 | http://www.apache.org/licenses/LICENSE-2.0 68 | 69 | Unless required by applicable law or agreed to in writing, software 70 | distributed under the License is distributed on an "AS IS" BASIS, 71 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 72 | See the License for the specific language governing permissions and 73 | limitations under the License. 74 | -------------------------------------------------------------------------------- /LICENSES/Apache-2.0.txt: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | SPDX-FileCopyrightText: 2023 Rivos Inc. 3 | 4 | SPDX-License-Identifier: Apache-2.0 5 | --> 6 | 7 | # Slurm Exporter 8 | 9 | [![REUSE status](https://api.reuse.software/badge/github.com/rivosinc/prometheus-slurm-exporter)](https://api.reuse.software/info/github.com/rivosinc/prometheus-slurm-exporter) 10 | [![Go Report Card](https://goreportcard.com/badge/github.com/rivosinc/prometheus-slurm-exporter)](https://goreportcard.com/report/github.com/rivosinc/prometheus-slurm-exporter) 11 | 12 | Inspired by the now-unmaintained prometheus slurm [exporter](https://github.com/vpenso/prometheus-slurm-exporter), this project implements, in some form or another, most of the 13 | metrics from that exporter. We have not yet added GPU or fairshare support, although we will be more than happy to accept contributions for those. 14 | This exporter supports the `--json` output of the slurm cli. Note that the supported plugin is `openapi/v0.0.37`, not the `data_parser` plugin that ships with the most modern versions of slurm. 15 | In production, we've found that the cli fallback (defining a custom json format on the slurm cmdline) performs far better and more reliably than parsing the 16 | slurm-provided json output. Thus, it is now the default mode of deployment, as it also doesn't require any compiled plugins. We are keeping the openapi support for 17 | future slurmrestd integration. We also support client-side throttling: in practice, users can have multiple prometheus instances polling the same exporter without worrying about 18 | overwhelming slurmctld. Finally, we've added tracing support. If enabled, users can publish process stats for their jobs and track allocated vs. live usage for 19 | profiling and optimization. 20 | 21 | ### Getting Started 22 | 23 | Golang >= 1.20 is required. From there, follow the `justfile` or run `just prod` to start a dev server. 24 | You can also install the exporter directly with `go install github.com/rivosinc/prometheus-slurm-exporter@latest`. Then you can run `prometheus-slurm-exporter -h`.
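The client-side throttling mentioned above is simple to picture: each fetcher wraps its expensive slurmctld queries in a cache that only refreshes once per `POLL_LIMIT` window. The snippet below is a minimal sketch of that idea under our own illustrative names (`throttledCache`, `fetchOrThrottle`); the exporter's actual implementation is the `AtomicThrottledCache` type in its `exporter` package, whose internals are not shown here.

```go
package throttle

import (
	"sync"
	"time"
)

// throttledCache sketches the exporter's client-side throttling: if the
// previous fetch happened less than `limit` ago, serve cached metrics
// instead of querying slurmctld again. Illustrative only; the real type
// is exporter.AtomicThrottledCache.
type throttledCache[T any] struct {
	mu    sync.Mutex
	limit time.Duration
	last  time.Time
	cache []T
}

func (c *throttledCache[T]) fetchOrThrottle(fetch func() ([]T, error)) ([]T, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Throttled path: concurrent prometheus pollers share this cached
	// result, so slurmctld sees at most one query per limit window.
	if time.Since(c.last) < c.limit && c.cache != nil {
		return c.cache, nil
	}
	metrics, err := fetch()
	if err != nil {
		return nil, err
	}
	c.cache, c.last = metrics, time.Now()
	return metrics, nil
}
```

Installation itself is a single `go install`: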
25 | 26 | ```bash 27 | # example installation 28 | $ go install github.com/rivosinc/prometheus-slurm-exporter@v1.6.3 29 | # or if you like living on the edge 30 | $ go install github.com/rivosinc/prometheus-slurm-exporter@latest 31 | # if not already added, ensure 32 | $ export PATH="$PATH:$GOPATH/bin" 33 | $ prometheus-slurm-exporter -h 34 | ... 35 | # probably the most common invocation 36 | $ prometheus-slurm-exporter -slurm.cli-fallback 37 | ``` 38 | 39 | Example prometheus config: 40 | 41 | ```yaml 42 | - job_name: 'slurm-prod-scraper' 43 | scrape_interval: 30s # For the best experience, this should be at least 2x POLL_LIMIT 44 | scrape_timeout: 30s 45 | static_configs: 46 | - targets: ['path.to.exporter:9092'] 47 | ``` 48 | 49 | We've also uploaded an example [dashboard](https://grafana.com/grafana/dashboards/19835-slurm-dashboardv2) to help users get started. If the link doesn't work, try importing it by ID: `19835` 50 | 51 | ### Job Tracing 52 | 53 | Job tracing is disabled by default. To enable it, simply add `-trace.enabled` to the arg list. This enables the `/trace` endpoint by default (configurable; see the help page). 54 | With tracing enabled, jobs can _POST_ process metrics to the exporter. This adds a memory overhead proportional to the number of jobs enabled for tracing. 55 | When writing wrapper scripts to upload job data, ensure that they publish data in a json schema the exporter understands and that they upload proc info at a rate faster than the prometheus scrape time (we recommend sampling at 2x the prometheus scrape rate). Wrapped jobs can then be traced in Grafana, so users can see a job's live resource usage 56 | alongside its allocated resources. Here is an example wrapper script: 57 | 58 | ```bash 59 | #!/bin/bash 60 | 61 | #SBATCH -p 62 | #SBATCH ...other options 63 | 64 | module load python3 65 | 66 | export SLURM_EXPORTER=http://path.to.exporter:8092/trace 67 | export SAMPLE_RATE=5 68 | 69 | python3 ./wrappers/proctrac.py $@ 70 | ``` 71 | 72 | We can then dispatch jobs with our wrapper script: `sbatch srun_wrapper.sh sleep 300`. With tracing enabled, we get the following visualization. 73 | ![trace example](images/trace_example.png) 74 | 75 | 76 | Feel free to write your own wrappers. To use ours, ensure slurm nodes can run `pip3 install psutil requests`. Currently, we do not plan to support any auth modes between the wrappers and the exporter, although contributions are welcome. 77 | Here is the trace architecture: 78 | ```mermaid 79 | flowchart LR 80 | 81 | A[Sbatch 1] -->|*10sec*| B[[Slurm Exporter]] 82 | C[Sbatch 2] -->|*10sec*| B[[Slurm Exporter]] 83 | D[Sbatch 3] -->|*10sec*| B[[Slurm Exporter]] 84 | B[[Slurm Exporter]] -->|*30sec*| E[(Prometheus)] 85 | ``` 86 | 87 | ### Available Metrics 88 | 89 | ```bash 90 | $ curl localhost:9092/metrics | grep "# HELP" 91 | # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. 92 | # HELP process_max_fds Maximum number of open file descriptors. 93 | # HELP process_open_fds Number of open file descriptors. 94 | # HELP process_resident_memory_bytes Resident memory size in bytes. 95 | # HELP process_start_time_seconds Start time of the process since unix epoch in seconds. 96 | # HELP process_virtual_memory_bytes Virtual memory size in bytes. 97 | # HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. 98 | # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
99 | # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. 100 | # HELP slurm_account_cpu_alloc alloc cpu consumed per account 101 | # HELP slurm_account_job_state_total total jobs per account per job state 102 | # HELP slurm_account_mem_alloc alloc mem consumed per account 103 | # HELP slurm_cpu_load Total cpu load 104 | # HELP slurm_cpus_idle Total idle cpus 105 | # HELP slurm_cpus_per_state Cpus per state i.e alloc, mixed, draining, etc. 106 | # HELP slurm_cpus_total Total cpus 107 | # HELP slurm_job_scrape_duration how long the cmd [cat fixtures/squeue_out.json] took (ms) 108 | # HELP slurm_job_scrape_error slurm job scrape error 109 | # HELP slurm_mem_alloc Total alloc mem 110 | # HELP slurm_mem_free Total free mem 111 | # HELP slurm_mem_real Total real mem 112 | # HELP slurm_node_scrape_duration how long the cmd [cat fixtures/sinfo_out.json] took (ms) 113 | # HELP slurm_node_scrape_error slurm node info scrape errors 114 | # HELP slurm_partition_alloc_cpus Alloc cpus per partition 115 | # HELP slurm_partition_alloc_mem Alloc mem per partition 116 | # HELP slurm_partition_cpu_load Total cpu load per partition 117 | # HELP slurm_partition_idle_cpus Idle cpus per partition 118 | # HELP slurm_partition_job_state_total total jobs per partition per state 119 | # HELP slurm_partition_real_mem Real mem per partition 120 | # HELP slurm_partition_total_cpus Total cpus per partition 121 | # HELP slurm_partition_weight Total node weight per partition?? 122 | # HELP slurm_user_cpu_alloc total cpu alloc per user 123 | # HELP slurm_user_mem_alloc total mem alloc per user 124 | # HELP slurm_user_state_total total jobs per state per user 125 | # HELP slurm_node_count_per_state nodes per state 126 | 127 | # Only available for -trace.enabled jobs 128 | # HELP slurm_proc_cpu_usage actual cpu usage collected from proc monitor 129 | # HELP slurm_proc_mem_usage proc mem usage 130 | # HELP slurm_proc_pid pid of running slurm job 131 | # HELP slurm_proc_read_bytes proc read bytes 132 | # HELP slurm_proc_threadcount threads currently being used 133 | # HELP slurm_proc_write_bytes proc write bytes 134 | # HELP slurm_job_cpu_alloc running job cpus allocated 135 | # HELP slurm_job_mem_alloc running job cpus allocated 136 | 137 | # Exporter stats 138 | # HELP slurm_node_count_per_state nodes per state 139 | # HELP slurm_node_scrape_duration how long the cmd [] took ms 140 | # HELP slurm_node_scrape_error slurm node info scrape errors 141 | # HELP slurm_job_count_per_state jobs per state 142 | # HELP slurm_job_scrape_duration how long the cmd [] took ms 143 | # HELP slurm_job_scrape_error slurm job info scrape errors 144 | 145 | ``` 146 | 147 | ### Exporter Env Var Docs 148 | 149 | Env vars can be specified in a `.env` file when using the `just` command runner. 150 | | Var | Default Value | Purpose | 151 | |-----------------|---------------|-----------------------------------------------------------------------------| 152 | | POLL_LIMIT | 10 | # of seconds to wait before polling slurmctl again (client-side throttling) | 153 | | LOGLEVEL | info | Log Level: debug, info, warn, error | 154 | | CLI_TIMEOUT | 10. | # of seconds before the exporter terminates a command | 155 | | TRACE_ROOT_PATH | "cwd" | Path to the ./templates directory where html files are located | 156 | 157 | ### RPM/DEB Packages 158 | 159 | You can download RPM or DEB versions from the [Releases](https://github.com/rivosinc/prometheus-slurm-exporter/releases) tab.
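### Writing a Custom Trace Publisher

As a companion to the Job Tracing section above, here is a rough sketch of a wrapper-style publisher in Go. The JSON field names below are assumptions for illustration only; the canonical schema is whatever `wrappers/proctrac.py` publishes, so consult it (or the exporter's trace handler) before writing your own.

```go
package main

import (
	"bytes"
	"encoding/json"
	"net/http"
	"os"
	"time"
)

// procSample is a hypothetical payload shape; the real schema is defined
// by the exporter's /trace endpoint and wrappers/proctrac.py.
type procSample struct {
	JobID    int     `json:"job_id"`
	Pid      int     `json:"pid"`
	CPUUsage float64 `json:"cpu_usage"`
	MemUsage float64 `json:"mem_usage"`
}

func main() {
	endpoint := os.Getenv("SLURM_EXPORTER")   // e.g. http://path.to.exporter:8092/trace
	ticker := time.NewTicker(5 * time.Second) // SAMPLE_RATE: publish faster than prometheus scrapes
	defer ticker.Stop()
	for range ticker.C {
		body, _ := json.Marshal(procSample{JobID: 42, Pid: os.Getpid(), CPUUsage: 0.5, MemUsage: 1 << 20})
		resp, err := http.Post(endpoint, "application/json", bytes.NewReader(body))
		if err != nil {
			continue // transient failures are fine; the next sample retries
		}
		resp.Body.Close()
	}
}
```

The only hard requirement, as noted above, is that samples arrive faster than the prometheus scrape interval so the trace gauges never go stale between scrapes.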
160 | 161 | ### Running with Systemd 162 | 163 | You can install a systemd service definition using the following command: 164 | 165 | ```bash 166 | sudo bash -c 'cat << EOF > /etc/systemd/system/prometheus-slurm-exporter.service 167 | [Unit] 168 | Description=Prometheus SLURM Exporter 169 | 170 | [Service] 171 | ExecStart=/usr/bin/prometheus-slurm-exporter 172 | Restart=always 173 | RestartSec=15 174 | 175 | [Install] 176 | WantedBy=multi-user.target 177 | EOF' 178 | sudo systemctl daemon-reload 179 | sudo systemctl enable --now prometheus-slurm-exporter.service 180 | ``` 181 | 182 | Customizing the systemd service with environment variables: 183 | 184 | ```bash 185 | sudo systemctl edit prometheus-slurm-exporter.service 186 | ``` 187 | 188 | ```text 189 | ### Editing /etc/systemd/system/prometheus-slurm-exporter.service.d/override.conf 190 | ### Anything between here and the comment below will become the new contents of the file 191 | 192 | [Service] 193 | Environment="PATH=/opt/slurm/bin" 194 | Environment="POLL_LIMIT=300" 195 | Environment="CLI_TIMEOUT=60" 196 | Environment="LOGLEVEL=debug" 197 | 198 | ### Lines below this comment will be discarded 199 | 200 | ### /usr/lib/systemd/system/prometheus-slurm-exporter.service 201 | # [Unit] 202 | # Description=Prometheus SLURM Exporter 203 | # 204 | # [Service] 205 | # ExecStart=/usr/bin/prometheus-slurm-exporter 206 | # Restart=always 207 | # RestartSec=15 208 | # 209 | # [Install] 210 | # WantedBy=multi-user.target 211 | ``` 212 | 213 | ### Future work 214 | slurmrestd support 215 | -------------------------------------------------------------------------------- /cext/cext.go: -------------------------------------------------------------------------------- 1 | package cext 2 | 3 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 4 | // 5 | // SPDX-License-Identifier: Apache-2.0 6 | 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "strings" 12 | "time" 13 | 14 | "log/slog" 15 | 16 | "github.com/prometheus/client_golang/prometheus" 17 | "github.com/rivosinc/prometheus-slurm-exporter/exporter" 18 | ) 19 | 20 | type Destructor interface { 21 | Deinit() 22 | } 23 | 24 | type CNodeFetcher struct { 25 | cache *exporter.AtomicThrottledCache[exporter.NodeMetric] 26 | scraper NodeMetricScraper 27 | duration time.Duration 28 | errorCounter prometheus.Counter 29 | } 30 | 31 | // Deinit should be deferred immediately after constructing a fetcher to prevent mem leaks 32 | func (cni *CNodeFetcher) Deinit() { 33 | DeleteNodeMetricScraper(cni.scraper) 34 | } 35 | 36 | func (cni *CNodeFetcher) CToGoMetricConvert() ([]exporter.NodeMetric, error) { 37 | if errno := cni.scraper.CollectNodeInfo(); errno != 0 { 38 | cni.errorCounter.Inc() 39 | return nil, fmt.Errorf("Node Info CPP errno: %d", errno) 40 | } 41 | cni.scraper.IterReset() 42 | nodeMetrics := make([]exporter.NodeMetric, 0) 43 | metric := NewPromNodeMetric() 44 | defer DeletePromNodeMetric(metric) 45 | nodeStates := map[uint64]string{ 46 | 0: "UNKNOWN", 47 | 1: "DOWN", 48 | 2: "IDLE", 49 | 3: "ALLOCATED", 50 | 4: "ERROR", 51 | 5: "MIXED", 52 | 6: "FUTURE", 53 | // used by the C api to detect end of enum.
Shouldn't ever be emitted 54 | 7: "END", 55 | } 56 | 57 | now := time.Now() 58 | for cni.scraper.IterNext(metric) == 0 { 59 | nodeMetrics = append(nodeMetrics, exporter.NodeMetric{ 60 | Hostname: metric.GetHostname(), 61 | Cpus: float64(metric.GetCpus()), 62 | RealMemory: float64(metric.GetRealMemory()), 63 | FreeMemory: float64(metric.GetFreeMem()), 64 | Partitions: strings.Split(metric.GetPartitions(), ","), 65 | State: nodeStates[metric.GetNodeState()], 66 | AllocMemory: float64(metric.GetAllocMem()), 67 | AllocCpus: float64(metric.GetAllocCpus()), 68 | IdleCpus: float64(metric.GetCpus()) - float64(metric.GetAllocCpus()), 69 | Weight: float64(metric.GetWeight()), 70 | CpuLoad: float64(metric.GetCpuLoad()), 71 | }) 72 | } 73 | cni.duration = time.Since(now) 74 | return nodeMetrics, nil 75 | } 76 | 77 | func (cni *CNodeFetcher) FetchMetrics() ([]exporter.NodeMetric, error) { 78 | return cni.cache.FetchOrThrottle(cni.CToGoMetricConvert) 79 | } 80 | 81 | func (cni *CNodeFetcher) ScrapeDuration() time.Duration { 82 | return cni.duration 83 | } 84 | 85 | func (cni *CNodeFetcher) ScrapeError() prometheus.Counter { 86 | return cni.errorCounter 87 | } 88 | 89 | func NewNodeFetcher(pollLimit float64) *CNodeFetcher { 90 | return &CNodeFetcher{ 91 | cache: exporter.NewAtomicThrottledCache[exporter.NodeMetric](pollLimit), 92 | scraper: NewNodeMetricScraper(""), 93 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{ 94 | Name: "slurm_cplugin_node_fetch_error", 95 | Help: "slurm cplugin fetch error", 96 | }), 97 | } 98 | } 99 | 100 | type CJobFetcher struct { 101 | cache *exporter.AtomicThrottledCache[exporter.JobMetric] 102 | scraper JobMetricScraper 103 | duration time.Duration 104 | errorCounter prometheus.Counter 105 | } 106 | 107 | func (cjf *CJobFetcher) CToGoMetricConvert() ([]exporter.JobMetric, error) { 108 | if errno := cjf.scraper.CollectJobInfo(); errno != 0 { 109 | cjf.errorCounter.Inc() 110 | return nil, fmt.Errorf("Job Info CPP errno: %d", errno) 111 | } 112 | jobStates := map[int]string{ 113 | 0: "PENDING", 114 | 1: "RUNNING", 115 | 2: "SUSPENDED", 116 | 3: "COMPLETE", 117 | 4: "CANCELLED", 118 | 5: "FAILED", 119 | 6: "TIMEOUT", 120 | 7: "NODE_FAIL", 121 | 8: "PREEMPTED", 122 | 9: "BOOT_FAIL", 123 | 10: "DEADLINE", 124 | 11: "OOM", 125 | // should never happen 126 | 12: "END", 127 | } 128 | metrics := make([]exporter.JobMetric, 0) 129 | cmetric := NewPromJobMetric() 130 | defer DeletePromJobMetric(cmetric) 131 | now := time.Now(); cjf.scraper.IterReset() 132 | for cjf.scraper.IterNext(cmetric) == 0 { 133 | metric := exporter.JobMetric{ 134 | Account: cmetric.GetAccount(), 135 | JobId: float64(cmetric.GetJobId()), 136 | EndTime: cmetric.GetEndTime(), 137 | JobState: jobStates[cmetric.GetJobState()], 138 | UserName: cmetric.GetUserName(), 139 | Partition: cmetric.GetPartitions(), 140 | JobResources: exporter.JobResource{ 141 | AllocCpus: cmetric.GetAllocCpus(), 142 | AllocNodes: map[string]*exporter.NodeResource{"0": {Mem: cmetric.GetAllocMem()}}, 143 | }, 144 | } 145 | metrics = append(metrics, metric) 146 | slog.Debug(fmt.Sprintf("metrics %v, alloc mem %f", metric, metric.JobResources.AllocNodes["0"].Mem)) 147 | } 148 | cjf.duration = time.Since(now) 149 | return metrics, nil 150 | } 151 | 152 | func (cjf *CJobFetcher) FetchMetrics() ([]exporter.JobMetric, error) { 153 | return cjf.cache.FetchOrThrottle(cjf.CToGoMetricConvert) 154 | } 155 | 156 | func (cjf *CJobFetcher) ScrapeDuration() time.Duration { 157 | return cjf.duration 158 | } 159 | 160 | func (cjf *CJobFetcher) ScrapeError() prometheus.Counter { 161 | return
cjf.errorCounter 162 | } 163 | 164 | func (cjf *CJobFetcher) Deinit() { 165 | DeleteJobMetricScraper(cjf.scraper) 166 | } 167 | 168 | func NewJobFetcher(pollLimit float64) *CJobFetcher { 169 | return &CJobFetcher{ 170 | cache: exporter.NewAtomicThrottledCache[exporter.JobMetric](pollLimit), 171 | scraper: NewJobMetricScraper(""), 172 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{ 173 | Name: "slurm_cplugin_job_fetch_error", 174 | Help: "slurm cplugin job fetch error", 175 | }), 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /cext/cext.swigcxx: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | %module cext 6 | 7 | %include 8 | %include "std_string.i" 9 | %include "std_vector.i" 10 | %include "std_map.i" 11 | %{ 12 | #include "cnodefetcher.hpp" 13 | #include "cjobfetcher.hpp" 14 | %} 15 | %include "cnodefetcher.hpp" 16 | %include "cjobfetcher.hpp" 17 | -------------------------------------------------------------------------------- /cext/cext_test.go: -------------------------------------------------------------------------------- 1 | package cext 2 | 3 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 4 | // 5 | // SPDX-License-Identifier: Apache-2.0 6 | 7 | import ( 8 | "os" 9 | "os/exec" 10 | "testing" 11 | 12 | "github.com/prometheus/client_golang/prometheus" 13 | "github.com/rivosinc/prometheus-slurm-exporter/exporter" 14 | "github.com/stretchr/testify/assert" 15 | ) 16 | 17 | func TestCtoGoNodeMetrics(t *testing.T) { 18 | assert := assert.New(t) 19 | collector := NewNodeFetcher(0) 20 | defer collector.Deinit() 21 | metrics, err := collector.CToGoMetricConvert() 22 | assert.NoError(err) 23 | assert.Positive(len(metrics)) 24 | } 25 | 26 | func TestCtoGoJobMetrics(t *testing.T) { 27 | assert := assert.New(t) 28 | fetcher := NewJobFetcher(0) 29 | defer fetcher.Deinit() 30 | cmd := exec.Command("srun", "sleep", "100") 31 | cmd.Start() 32 | defer cmd.Process.Kill() 33 | metrics, err := fetcher.CToGoMetricConvert() 34 | assert.NoError(err) 35 | assert.Positive(len(metrics)) 36 | } 37 | func TestCtoGoJobMetricsTwice(t *testing.T) { 38 | assert := assert.New(t) 39 | fetcher := NewJobFetcher(0) 40 | defer fetcher.Deinit() 41 | cmd := exec.Command("srun", "sleep", "100") 42 | cmd.Start() 43 | defer cmd.Process.Kill() 44 | metrics, err := fetcher.CToGoMetricConvert() 45 | assert.NoError(err) 46 | assert.Positive(len(metrics)) 47 | metrics, err = fetcher.CToGoMetricConvert() 48 | assert.NoError(err) 49 | assert.Positive(len(metrics)) 50 | } 51 | 52 | func TestCtoGoNodeMetricsTwice(t *testing.T) { 53 | assert := assert.New(t) 54 | // force cache misses 55 | collector := NewNodeFetcher(0) 56 | defer collector.Deinit() 57 | metrics, err := collector.CToGoMetricConvert() 58 | assert.NoError(err) 59 | assert.Positive(len(metrics)) 60 | // tests cached partition & node info data path 61 | metrics, err = collector.CToGoMetricConvert() 62 | assert.NoError(err) 63 | assert.Positive(len(metrics)) 64 | } 65 | 66 | func TestNodeCollectorCFetcher(t *testing.T) { 67 | if os.Getenv("TEST_CLUSTER") != "true" { 68 | return 69 | } 70 | assert := assert.New(t) 71 | config, err := exporter.NewConfig(new(exporter.CliFlags)) 72 | assert.Nil(err) 73 | config.PollLimit = 10 74 | nc := exporter.NewNodeCollecter(config) 75 | // cache miss, use our mock fetcher 76 | nc.SetFetcher(NewNodeFetcher(config.PollLimit)) 77 | 
metricChan := make(chan prometheus.Metric) 78 | go func() { 79 | nc.Collect(metricChan) 80 | close(metricChan) 81 | }() 82 | metrics := make([]prometheus.Metric, 0) 83 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 84 | metrics = append(metrics, m) 85 | t.Logf("Received metric %s", m.Desc().String()) 86 | } 87 | assert.NotEmpty(metrics) 88 | } 89 | -------------------------------------------------------------------------------- /cext/cjobfetcher.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | #include <iostream> 6 | #include <slurm/slurm.h> 7 | #include <cjobfetcher.hpp> // resolved via -I. (see justfile) 8 | 9 | const string STRING_NULL = "(null)"; 10 | constexpr int MB = 1000000; 11 | 12 | PromJobMetric::PromJobMetric(slurm_job_info_t &job_ref) 13 | { 14 | job_info = job_ref; 15 | if ((JOB_STATE_BASE & job_info.job_state) != JOB_RUNNING) 16 | return; 17 | slurm_job_cpus_allocated_on_node(job_info.job_resrcs, job_info.nodes); 18 | int error_code = slurm_get_errno(); 19 | if (SLURM_SUCCESS != error_code && SLURM_NO_CHANGE_IN_DATA != error_code) 20 | printf("failed to add alloc cpus with errno %d \n", error_code); 21 | } 22 | 23 | PromJobMetric::PromJobMetric() 24 | { 25 | job_info = slurm_job_info_t(); 26 | } 27 | 28 | PromJobMetric::~PromJobMetric() {} 29 | 30 | string PromJobMetric::GetAccount() 31 | { 32 | if (job_info.account) 33 | return job_info.account; 34 | return STRING_NULL; 35 | } 36 | 37 | int PromJobMetric::GetJobId() 38 | { 39 | return job_info.job_id; 40 | } 41 | 42 | double PromJobMetric::GetEndTime() 43 | { 44 | return job_info.end_time; 45 | } 46 | 47 | double PromJobMetric::GetAllocCpus() 48 | { 49 | if (nullptr == job_info.job_resrcs) 50 | return job_info.pn_min_cpus; 51 | job_resrcs *resc = (job_resrcs *)job_info.job_resrcs; 52 | return (double)resc->ncpus; 53 | } 54 | 55 | double PromJobMetric::GetAllocMem() 56 | { 57 | if (job_info.gres_total) { 58 | cout << "gres total " << job_info.gres_total << " state: " << job_info.job_state << endl; 59 | } 60 | if (nullptr == job_info.job_resrcs) { 61 | cout << "min_mem " << job_info.mem_per_tres << " num nodes " << job_info.num_nodes << "\n"; 62 | return job_info.pn_min_memory * job_info.num_nodes; 63 | } 64 | job_resrcs *resc = (job_resrcs *)job_info.job_resrcs; 65 | uint64_t alloc_mem = 0; 66 | for (int i = 0; i < resc->nhosts; i++) 67 | alloc_mem += resc->memory_allocated[i]; 68 | return (double)alloc_mem * MB; 69 | } 70 | 71 | int PromJobMetric::GetJobState() 72 | { 73 | return job_info.job_state; 74 | } 75 | 76 | string PromJobMetric::GetPartitions() 77 | { 78 | return job_info.partition ? job_info.partition : STRING_NULL; 79 | } 80 | 81 | string PromJobMetric::GetUserName() 82 | { 83 | if (0 == job_info.user_id) 84 | return "root"; 85 | if (nullptr == job_info.user_name) 86 | return STRING_NULL; 87 | return job_info.user_name; 88 | } 89 | 90 | JobMetricScraper::JobMetricScraper(string conf) 91 | { 92 | if (conf == "") 93 | { 94 | slurm_init(nullptr); 95 | } 96 | else 97 | { 98 | slurm_init(conf.c_str()); 99 | } 100 | new_job_ptr = nullptr; 101 | old_job_ptr = nullptr; 102 | IterReset(); 103 | } 104 | 105 | int JobMetricScraper::CollectJobInfo() 106 | { 107 | time_t updated_at = old_job_ptr ?
old_job_ptr->last_update : (time_t) 0; 108 | int error_code = slurm_load_jobs(updated_at, &new_job_ptr, SHOW_DETAIL); 109 | if (SLURM_SUCCESS != error_code && SLURM_NO_CHANGE_IN_DATA == slurm_get_errno()) 110 | { 111 | error_code = SLURM_SUCCESS; 112 | new_job_ptr = old_job_ptr; 113 | } 114 | if (SLURM_SUCCESS != error_code) 115 | return slurm_get_errno(); 116 | 117 | // ensure stale members aren't kept in the map, i.e. the new job array may be a subset of the old job array; 118 | // old_job_array + new_job_array could also still be a subset of the collection map, so 119 | // delete all stale members in the map 120 | if (old_job_ptr && new_job_ptr != old_job_ptr){ 121 | job_metric_map.clear(); 122 | slurm_free_job_info_msg(old_job_ptr); 123 | } 124 | // enrich with new members 125 | for (int i = 0; i < new_job_ptr->record_count; i++) 126 | { 127 | slurm_job_info_t job = new_job_ptr->job_array[i]; 128 | PromJobMetric metric(job); 129 | job_metric_map[metric.GetJobId()] = metric; 130 | } 131 | old_job_ptr = new_job_ptr; 132 | return SLURM_SUCCESS; 133 | } 134 | 135 | int JobMetricScraper::IterNext(PromJobMetric *metric) 136 | { 137 | if (it == job_metric_map.cend()) 138 | return 1; 139 | *metric = it->second; 140 | it++; 141 | return 0; 142 | } 143 | 144 | void JobMetricScraper::IterReset() 145 | { 146 | it = job_metric_map.cbegin(); 147 | } 148 | -------------------------------------------------------------------------------- /cext/cjobfetcher.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | #include <slurm/slurm.h> 6 | #include <string> 7 | #include <map> 8 | 9 | using namespace std; 10 | 11 | class PromJobMetric 12 | { 13 | slurm_job_info_t job_info; 14 | // unfortunately this mirrors an opaque data type: https://github.com/SchedMD/slurm/blob/master/src/common/job_resources.h#L106-L128 15 | // the slurm library doesn't expose enough information for us to scrape 16 | struct job_resrcs 17 | { 18 | bitstr_t *core_bitmap; 19 | bitstr_t *core_bitmap_used; 20 | uint32_t cpu_array_cnt; 21 | uint16_t *cpu_array_value; 22 | uint32_t *cpu_array_reps; 23 | uint16_t *cpus; 24 | uint16_t *cpus_used; 25 | uint16_t *cores_per_socket; 26 | uint16_t cr_type; 27 | uint64_t *memory_allocated; 28 | uint64_t *memory_used; 29 | uint32_t nhosts; 30 | bitstr_t *node_bitmap; 31 | uint32_t node_req; 32 | char *nodes; 33 | uint32_t ncpus; 34 | uint32_t *sock_core_rep_count; 35 | uint16_t *sockets_per_node; 36 | uint16_t *tasks_per_node; 37 | uint16_t threads_per_core; 38 | uint8_t whole_node; 39 | }; 40 | 41 | public: 42 | PromJobMetric(slurm_job_info_t &job_ref); 43 | PromJobMetric(); 44 | ~PromJobMetric(); 45 | string GetAccount(); 46 | int GetJobId(); 47 | double GetEndTime(); 48 | double GetAllocCpus(); 49 | double GetAllocMem(); 50 | int GetJobState(); 51 | string GetPartitions(); 52 | string GetUserName(); 53 | }; 54 | 55 | class JobMetricScraper 56 | { 57 | private: 58 | job_info_msg_t *new_job_ptr, *old_job_ptr; 59 | map<int, PromJobMetric> job_metric_map; 60 | map<int, PromJobMetric>::const_iterator it; 61 | 62 | public: 63 | JobMetricScraper(string conf); 64 | int CollectJobInfo(); 65 | int IterNext(PromJobMetric *metric); 66 | void IterReset(); 67 | }; 68 | -------------------------------------------------------------------------------- /cext/cnodefetcher.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc.
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | #include <slurm/slurm.h> 5 | #include <slurm/slurm_errno.h> 6 | #include <iostream> 7 | #include <map> 8 | #include <string> 9 | #include <cstdint> 10 | #include "cnodefetcher.hpp" 11 | 12 | PromNodeMetric::PromNodeMetric(node_info_t &node_ptr) 13 | { 14 | node_info = node_ptr; 15 | int err = slurm_get_select_nodeinfo(node_info.select_nodeinfo, 16 | SELECT_NODEDATA_SUBCNT, 17 | NODE_STATE_ALLOCATED, 18 | &alloc_cpus); 19 | if (err) 20 | cout << "WARNING: failed to enrich alloc cpu data\n"; 21 | err += slurm_get_select_nodeinfo(node_info.select_nodeinfo, 22 | SELECT_NODEDATA_MEM_ALLOC, 23 | NODE_STATE_ALLOCATED, 24 | &alloc_mem); 25 | if (err) 26 | cout << "WARNING: failed to enrich alloc mem data\n"; 27 | } 28 | 29 | PromNodeMetric::PromNodeMetric() 30 | { 31 | node_info = node_info_t(); 32 | } 33 | 34 | string PromNodeMetric::GetHostname() 35 | { 36 | return node_info.node_hostname; 37 | } 38 | 39 | string PromNodeMetric::GetPartitions() 40 | { 41 | return node_info.partitions; 42 | } 43 | 44 | double PromNodeMetric::GetCpuLoad() 45 | { 46 | return (double)node_info.cpu_load / 100; 47 | } 48 | 49 | double PromNodeMetric::GetCpus() 50 | { 51 | return (double)node_info.cpus; 52 | } 53 | 54 | double PromNodeMetric::GetFreeMem() 55 | { 56 | return (double)node_info.free_mem * MB; 57 | } 58 | 59 | double PromNodeMetric::GetRealMemory() 60 | { 61 | return (double)node_info.real_memory * MB; 62 | } 63 | 64 | double PromNodeMetric::GetWeight() 65 | { 66 | return (double)node_info.weight; 67 | } 68 | 69 | double PromNodeMetric::GetAllocCpus() 70 | { 71 | return (double)alloc_cpus; 72 | } 73 | 74 | double PromNodeMetric::GetAllocMem() 75 | { 76 | return (double)alloc_mem * MB; 77 | } 78 | 79 | uint64_t PromNodeMetric::GetNodeState() 80 | { 81 | return (uint64_t)node_info.node_state; 82 | } 83 | 84 | // destruction should happen via slurm_free_node_info_msg, not via individual destructors 85 | PromNodeMetric::~PromNodeMetric() {} 86 | 87 | NodeMetricScraper::~NodeMetricScraper() 88 | { 89 | if (new_node_ptr) 90 | slurm_free_node_info_msg(new_node_ptr); 91 | if (old_node_ptr != new_node_ptr && old_node_ptr) 92 | slurm_free_node_info_msg(old_node_ptr); 93 | old_node_ptr = nullptr; 94 | new_node_ptr = nullptr; 95 | if (new_part_ptr) 96 | slurm_free_partition_info_msg(new_part_ptr); 97 | if (old_part_ptr != new_part_ptr && old_part_ptr) 98 | slurm_free_partition_info_msg(old_part_ptr); 99 | old_part_ptr = nullptr; 100 | new_part_ptr = nullptr; 101 | slurm_fini(); 102 | } 103 | 104 | int NodeMetricScraper::CollectNodeInfo() 105 | { 106 | int error_code; 107 | time_t part_update_at, node_update_at; 108 | part_update_at = old_part_ptr ? old_part_ptr->last_update : (time_t) 0; 109 | node_update_at = old_node_ptr ? old_node_ptr->last_update : (time_t) 0; 110 | error_code = slurm_load_partitions(part_update_at, &new_part_ptr, SHOW_ALL); 111 | if (SLURM_SUCCESS != error_code && SLURM_NO_CHANGE_IN_DATA == slurm_get_errno()) 112 | { 113 | error_code = SLURM_SUCCESS; 114 | new_part_ptr = old_part_ptr; 115 | } 116 | if (SLURM_SUCCESS != error_code) 117 | return slurm_get_errno(); 118 | node_update_at = old_node_ptr ?
old_node_ptr->last_update : (time_t) 0; 119 | error_code = slurm_load_node(node_update_at, &new_node_ptr, SHOW_ALL); 120 | if (SLURM_SUCCESS != error_code && SLURM_NO_CHANGE_IN_DATA == slurm_get_errno()) 121 | { 122 | error_code = SLURM_SUCCESS; 123 | new_node_ptr = old_node_ptr; 124 | } 125 | if (SLURM_SUCCESS != error_code) 126 | return error_code; 127 | // enrich with partition info 128 | slurm_populate_node_partitions(new_node_ptr, new_part_ptr); 129 | if (old_node_ptr && old_node_ptr != new_node_ptr){ 130 | enriched_metrics.clear(); 131 | slurm_free_node_info_msg(old_node_ptr); 132 | } 133 | if (old_part_ptr != new_part_ptr) 134 | slurm_free_partition_info_msg(old_part_ptr); 135 | 136 | for (int i = 0; i < new_node_ptr->record_count; i++) 137 | { 138 | PromNodeMetric metric(new_node_ptr->node_array[i]); 139 | enriched_metrics[metric.GetHostname()] = metric; 140 | } 141 | 142 | old_node_ptr = new_node_ptr; 143 | old_part_ptr = new_part_ptr; 144 | return SLURM_SUCCESS; 145 | } 146 | 147 | void NodeMetricScraper::Print() 148 | { 149 | cout << "NodeMetrics: ["; 150 | for (auto const &p : enriched_metrics) 151 | cout << "{" << p.first << "},"; 152 | cout << "]" << endl; 153 | } 154 | 155 | int NodeMetricScraper::IterNext(PromNodeMetric *metric) 156 | { 157 | if (it == enriched_metrics.cend()) 158 | return SLURM_ERROR; 159 | *metric = it->second; 160 | it++; 161 | return SLURM_SUCCESS; 162 | } 163 | 164 | void NodeMetricScraper::IterReset() 165 | { 166 | it = enriched_metrics.cbegin(); 167 | } 168 | 169 | NodeMetricScraper::NodeMetricScraper(string conf) 170 | { 171 | if (conf == "") 172 | slurm_init(nullptr); 173 | else 174 | slurm_init(conf.c_str()); 175 | new_node_ptr = nullptr; 176 | old_node_ptr = nullptr; 177 | new_part_ptr = nullptr; 178 | old_part_ptr = nullptr; 179 | IterReset(); 180 | } 181 | -------------------------------------------------------------------------------- /cext/cnodefetcher.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | #include <slurm/slurm.h> 6 | #include <slurm/slurm_errno.h> 7 | #include <iostream> 8 | #include <map> 9 | #include <string> 10 | #include <cstdint> 11 | using namespace std; 12 | 13 | constexpr int MB = 1000000; 14 | 15 | class PromNodeMetric 16 | { 17 | private: 18 | node_info_t node_info; 19 | uint16_t alloc_cpus; 20 | uint64_t alloc_mem; 21 | 22 | public: 23 | PromNodeMetric(node_info_t &node_info); 24 | PromNodeMetric(); 25 | ~PromNodeMetric(); 26 | // return double to easily coerce to go float64 27 | double GetCpus(); 28 | double GetRealMemory(); 29 | double GetFreeMem(); 30 | uint64_t GetNodeState(); 31 | double GetAllocCpus(); 32 | double GetAllocMem(); 33 | double GetWeight(); 34 | double GetCpuLoad(); 35 | string GetHostname(); 36 | string GetPartitions(); 37 | }; 38 | 39 | struct NodeMetricScraper 40 | { 41 | private: 42 | partition_info_msg_t *new_part_ptr, *old_part_ptr; 43 | node_info_msg_t *new_node_ptr, *old_node_ptr; 44 | map<string, PromNodeMetric> enriched_metrics; 45 | map<string, PromNodeMetric>::const_iterator it; 46 | 47 | public: 48 | NodeMetricScraper(string conf); 49 | ~NodeMetricScraper(); 50 | int CollectNodeInfo(); 51 | void Print(); 52 | // public iterator exposure.
-------------------------------------------------------------------------------- /cext/justfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | build_dir := "./build" 6 | set dotenv-path := "../.env" 7 | set dotenv-load 8 | set shell := ["bash", "-ceo", "pipefail"] 9 | 10 | default: 11 | just --list 12 | 13 | build: 14 | rm -rf {{build_dir}} 15 | mkdir {{build_dir}} 16 | CGO_CXXFLAGS="-I${SLURM_INCLUDE_DIR}" CGO_LDFLAGS="-L${SLURM_LIB_DIR} -lslurmfull" go build -o {{build_dir}}/slurm_exporter server.go 17 | 18 | devel: build 19 | LD_LIBRARY_PATH="${SLURM_LIB_DIR}" {{build_dir}}/slurm_exporter 20 | 21 | gotest: 22 | #!/bin/bash 23 | set -aeuxo pipefail 24 | CGO_CXXFLAGS="-I${SLURM_INCLUDE_DIR}" 25 | CGO_LDFLAGS="-L${SLURM_LIB_DIR} -lslurmfull" 26 | LD_LIBRARY_PATH="${SLURM_LIB_DIR}" 27 | TEST_CLUSTER=true 28 | go test 29 | 30 | cppnodetest: 31 | #!/bin/bash 32 | set -axu 33 | rm -rf {{build_dir}}/test && mkdir -p {{build_dir}}/test 34 | g++ test/test_util.cpp test/node_test.cpp *.cpp -I. -I${SLURM_INCLUDE_DIR} -L${SLURM_LIB_DIR} -lslurmfull -o {{build_dir}}/test/node_test 35 | SID=`sbatch test/sleep.sh | awk '{print $4}'` 36 | while [[ `squeue -j ${SID} --noheader -o %t` != "R" ]]; do echo "waiting for job to start..." && sleep 1; done 37 | LD_LIBRARY_PATH=$SLURM_LIB_DIR {{build_dir}}/test/node_test 38 | scancel $SID 39 | 40 | cppjobtest: 41 | #!/bin/bash 42 | set -axu 43 | rm -rf {{build_dir}}/test && mkdir -p {{build_dir}}/test 44 | g++ -g test/test_util.cpp test/job_test.cpp *.cpp -I. -I${SLURM_INCLUDE_DIR} -L${SLURM_LIB_DIR} -lslurmfull -o {{build_dir}}/test/job_test 45 | # this is assumed to be on the test cluster w/o other running slurm jobs 46 | SID=`sbatch test/sleep.sh | awk '{print $4}'` 47 | while [[ `squeue -j ${SID} --noheader -o %t` != "R" ]]; do echo "waiting for job to start..." && sleep 1; done 48 | LD_LIBRARY_PATH=$SLURM_LIB_DIR gdb {{build_dir}}/test/job_test 49 | scancel $SID 50 | 51 | 52 | swigdebug: 53 | rm -rf {{build_dir}}/swig && mkdir -p {{build_dir}}/swig 54 | swig -c++ -go -intgosize 64 -outdir {{build_dir}}/swig -o {{build_dir}}/swig/slurmcprom_wrap.cpp -verbose slurmcprom.swigcxx 55 | -------------------------------------------------------------------------------- /cext/server.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc.
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package cext 5 | 6 | import ( 7 | "net/http" 8 | "os" 9 | 10 | "github.com/prometheus/client_golang/prometheus" 11 | "github.com/prometheus/client_golang/prometheus/promhttp" 12 | "github.com/rivosinc/prometheus-slurm-exporter/exporter" 13 | "log/slog" 14 | ) 15 | 16 | func InitPromServer(config *exporter.Config) (http.Handler, []Destructor) { 17 | textHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ 18 | Level: config.LogLevel, 19 | }) 20 | slog.SetDefault(slog.New(textHandler)) 21 | nodeCollector := exporter.NewNodeCollecter(config) 22 | cNodeFetcher := NewNodeFetcher(config.PollLimit) 23 | nodeCollector.SetFetcher(cNodeFetcher) 24 | prometheus.MustRegister(nodeCollector) 25 | cJobFetcher := NewJobFetcher(config.PollLimit) 26 | jobCollector := exporter.NewJobsController(config) 27 | jobCollector.SetFetcher(cJobFetcher) 28 | prometheus.MustRegister(jobCollector) 29 | return promhttp.Handler(), []Destructor{cNodeFetcher, cJobFetcher} 30 | } 31 |
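Because InitPromServer returns a plain http.Handler, it can be exercised without binding a port. Below is a minimal smoke-test sketch using net/http/httptest; the testConfig helper is hypothetical and stands in for building an exporter.Config.

package cext

import (
	"net/http/httptest"
	"testing"
)

func TestMetricsEndpoint(t *testing.T) {
	handler, destructors := InitPromServer(testConfig(t)) // testConfig: hypothetical config helper
	defer func() {
		for _, d := range destructors {
			d.Deinit() // free the C-side fetchers, mirroring cmain.go
		}
	}()
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest("GET", "/metrics", nil))
	if rec.Code != 200 {
		t.Fatalf("expected 200 from /metrics, got %d", rec.Code)
	}
}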
-------------------------------------------------------------------------------- /cext/test/job_test.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | #include <cassert> 5 | #include <cstdio> 6 | #include <iostream> 7 | #include <string> 8 | #include <cjobfetcher.hpp> 9 | #include <test_util.hpp> 10 | 11 | using namespace std; 12 | 13 | void JobMetricScraper_CollectHappy(TestHandler &th) 14 | { 15 | auto scraper = JobMetricScraper(""); 16 | int errnum = scraper.CollectJobInfo(); 17 | string testname("Job Metric Scraper Collect Happy"); 18 | th.Register(TestWrapper(testname, errnum == 0)); 19 | } 20 | 21 | void JobMetricScraper_CollectTwice(TestHandler &th) 22 | { 23 | auto scraper = JobMetricScraper(""); 24 | int errnum = scraper.CollectJobInfo(); 25 | int errnum2 = scraper.CollectJobInfo(); 26 | string testname("Job Metric Scraper Cache hit Works"); 27 | th.Register(TestWrapper(testname, errnum == 0 && errnum2 == 0)); 28 | } 29 | 30 | void JobMetricScraper_CollectThrice(TestHandler &th) 31 | { 32 | auto scraper = JobMetricScraper(""); 33 | int errnum = scraper.CollectJobInfo(); 34 | int errnum2 = scraper.CollectJobInfo(); 35 | int errnum3 = scraper.CollectJobInfo(); 36 | cout << "end" << endl; 37 | string testname("Job Metric Catch Seg"); 38 | th.Register(TestWrapper(testname, errnum == 0 && errnum2 == 0 && errnum3 == 0)); 39 | } 40 | 41 | void TestIter(TestHandler &th) 42 | { 43 | JobMetricScraper scraper(""); 44 | int errnum = scraper.CollectJobInfo(); 45 | scraper.IterReset(); 46 | auto metric = new PromJobMetric; 47 | int count = 0; 48 | assert(errnum == 0); 49 | while (scraper.IterNext(metric) == 0) 50 | count++; 51 | string testname("Test Map Iteration After Collection"); 52 | th.Register(TestWrapper(testname, count > 0)); 53 | } 54 | 55 | void TestIter_Empty(TestHandler &th) 56 | { 57 | auto scraper = JobMetricScraper(""); 58 | auto metric = new PromJobMetric; 59 | string testname("Test Map Iteration Before Collection"); 60 | th.Register(TestWrapper(testname, scraper.IterNext(metric) != 0)); 61 | } 62 | 63 | void TestGetAllocCpus(TestHandler &th) 64 | { 65 | auto scraper = JobMetricScraper(""); 66 | scraper.CollectJobInfo(); 67 | auto metric = PromJobMetric(); 68 | scraper.IterReset(); 69 | scraper.IterNext(&metric); 70 | 71 | string testname("Test Get Alloc Cpus"); 72 | int cpus = metric.GetAllocCpus(); 73 | printf("cpus = %d\n", cpus); 74 | // this is identical to what's reported by squeue --json 75 | // with a running job 76 | th.Register(TestWrapper(testname, cpus == 1)); 77 | } 78 | 79 | void TestGetAllocMem(TestHandler &th) 80 | { 81 | auto scraper = JobMetricScraper(""); 82 | scraper.CollectJobInfo(); 83 | auto metric = PromJobMetric(); 84 | scraper.IterReset(); 85 | scraper.IterNext(&metric); 86 | 87 | string testname("Test Get Alloc Mem"); 88 | int mem = metric.GetAllocMem(); 89 | printf("mem = %d\n", mem); 90 | // this is identical to what's reported by squeue --json 91 | // with a running job 92 | th.Register(TestWrapper(testname, mem == 0)); 93 | } 94 | 95 | void TestGetUserName(TestHandler &th) 96 | { 97 | auto scraper = JobMetricScraper(""); 98 | scraper.CollectJobInfo(); 99 | auto metric = PromJobMetric(); 100 | scraper.IterReset(); 101 | scraper.IterNext(&metric); 102 | string testname("Test Get User Name"); 103 | string username = metric.GetUserName(); 104 | th.Register(TestWrapper(testname, username == "root")); 105 | } 106 | 107 | int main() 108 | { 109 | TestHandler handler; 110 | JobMetricScraper_CollectHappy(handler); 111 | JobMetricScraper_CollectTwice(handler); 112 | JobMetricScraper_CollectThrice(handler); 113 | TestGetAllocCpus(handler); 114 | TestGetAllocMem(handler); 115 | TestGetUserName(handler); 116 | TestIter(handler); 117 | TestIter_Empty(handler); 118 | return handler.Report(); 119 | } 120 |
Iteration After Collection"); 65 | th.Register(TestWrapper(testname, count > 0)); 66 | } 67 | 68 | void TestIter_Empty(TestHandler &th) 69 | { 70 | auto scraper = NodeMetricScraper(""); 71 | auto metric = new PromNodeMetric; 72 | string testname("Test Map Iteration Before Collection"); 73 | th.Register(TestWrapper(testname, scraper.IterNext(metric) != 0)); 74 | } 75 | 76 | int main() 77 | { 78 | TestHandler handler; 79 | NodeMetricScraper_CollectHappy(handler); 80 | NodeMetricScraper_CollectTwice(handler); 81 | NodeMetricScraper_CollectThrice(handler); 82 | TestGetAllocMem(handler); 83 | TestIter(handler); 84 | TestIter_Empty(handler); 85 | return handler.Report(); 86 | } 87 | -------------------------------------------------------------------------------- /cext/test/sleep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 3 | # 4 | # SPDX-License-Identifier: Apache-2.0 5 | #SBATCH -o /dev/null 6 | #SBATCH --oversubscribe 7 | #SBATCH --mem 1 8 | 9 | sleep 100 10 | -------------------------------------------------------------------------------- /cext/test/test_util.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | using namespace std; 10 | 11 | TestWrapper::TestWrapper(string testName, int errnum) 12 | { 13 | TestName = testName; 14 | Passed = errnum; 15 | } 16 | 17 | TestHandler::TestHandler() 18 | { 19 | start = chrono::high_resolution_clock::now(); 20 | } 21 | 22 | void TestHandler::Register(TestWrapper wrp) 23 | { 24 | tests.push_back(wrp); 25 | } 26 | 27 | int TestHandler::Report() 28 | { 29 | auto duration = chrono::duration_cast(chrono::high_resolution_clock::now() - start); 30 | int fails = 0; 31 | for (auto const &tw : tests) 32 | { 33 | if (tw.Passed) 34 | continue; 35 | fails++; 36 | cout << "Test " << tw.TestName; 37 | cout << " errored with code " << tw.Passed << endl; 38 | } 39 | cout << "Summary: " << endl; 40 | cout << " Ran: " << tests.size() << endl; 41 | if (fails) 42 | cout << " Failed: " << fails << endl; 43 | cout << " Passed: " << tests.size() - fails << endl; 44 | cout << "Took " << duration.count() << "ms" << endl; 45 | return fails; 46 | } 47 | -------------------------------------------------------------------------------- /cext/test/test_util.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | 10 | struct TestWrapper 11 | { 12 | TestWrapper(string testName, int errnum); 13 | string TestName; 14 | int Passed; 15 | }; 16 | 17 | class TestHandler 18 | { 19 | vector tests; 20 | chrono::system_clock::time_point start; 21 | 22 | public: 23 | void Register(TestWrapper wrp); 24 | int Report(); 25 | TestHandler(); 26 | }; 27 | -------------------------------------------------------------------------------- /cmain.go: -------------------------------------------------------------------------------- 1 | //go:build cenabled 2 | 3 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 
27 | -------------------------------------------------------------------------------- /cmain.go: -------------------------------------------------------------------------------- 1 | //go:build cenabled 2 | 3 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 4 | // 5 | // SPDX-License-Identifier: Apache-2.0 6 | package main 7 | 8 | import ( 9 | "flag" 10 | "log" 11 | "net/http" 12 | 13 | "github.com/rivosinc/prometheus-slurm-exporter/cext" 14 | "github.com/rivosinc/prometheus-slurm-exporter/exporter" 15 | "log/slog" 16 | ) 17 | 18 | var ( 19 | listenAddress = flag.String("web.listen-address", "", 20 | "Address to listen on for telemetry (default: :9092)") 21 | metricsPath = flag.String("web.telemetry-path", "", 22 | "Path under which to expose metrics (default: /metrics)") 23 | logLevel = flag.String("web.log-level", "", "Log level: info, debug, error, warning") 24 | ) 25 | 26 | func main() { 27 | flag.Parse() 28 | cliArgs := exporter.CliFlags{ 29 | ListenAddress: *listenAddress, 30 | MetricsPath: *metricsPath, 31 | LogLevel: *logLevel, 32 | } 33 | config, err := exporter.NewConfig(&cliArgs) 34 | if err != nil { 35 | log.Fatalf("failed to init config with %q", err) 36 | } 37 | handler, fetchersToFree := cext.InitPromServer(config) 38 | defer func() { 39 | for _, fetcher := range fetchersToFree { 40 | fetcher.Deinit() 41 | } 42 | }() 43 | http.Handle(config.MetricsPath, handler) 44 | slog.Info("serving metrics at " + config.ListenAddress + config.MetricsPath) 45 | log.Fatalf("server exited with %q", http.ListenAndServe(config.ListenAddress, nil)) 46 | } 47 |
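One subtlety in main above: log.Fatalf calls os.Exit, which terminates the process without running deferred functions, so the Deinit loop never fires when the server exits. A variant that tears the C-side fetchers down explicitly before exiting, replacing the final line of main (os must be added to the imports):

	srvErr := http.ListenAndServe(config.ListenAddress, nil)
	slog.Error("server exited", "error", srvErr)
	for _, fetcher := range fetchersToFree {
		fetcher.Deinit() // run the C-side destructors before the process dies
	}
	os.Exit(1)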
-------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 4 | # 5 | # SPDX-License-Identifier: Apache-2.0 6 | 7 | SLURM_CONF_DIR=${SLURM_CONF_DIR:-'/etc/slurm'} 8 | MUNGE_KEY=${MUNGE_KEY:-'/etc/munge/munge.key'} 9 | DOCKER_IMAGE=${1:-'slurm_exporter'} 10 | 11 | rm -rf tmp_sconf 12 | mkdir tmp_sconf 13 | cp "$SLURM_CONF_DIR"/slurm* tmp_sconf 14 | cp "$MUNGE_KEY" tmp_sconf 15 | docker build -t "$DOCKER_IMAGE" . 16 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 3 | # 4 | # SPDX-License-Identifier: Apache-2.0 5 | munged -f 2> /dev/null 6 | slurmctld 7 | slurmd 8 | exec "$@" 9 | -------------------------------------------------------------------------------- /exporter/diags.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "encoding/json" 9 | "fmt" 10 | 11 | "log/slog" 12 | 13 | "github.com/prometheus/client_golang/prometheus" 14 | ) 15 | 16 | type IntFromOptionalStruct int 17 | 18 | func (ffoo *IntFromOptionalStruct) UnmarshalJSON(data []byte) error { 19 | // between certain versions of data_parser, certain integer fields 20 | // can be given either as a plain int or in the form 21 | // {"average_time": {"set": true, "number": 1234, "infinite": false}} 22 | // this type coerces both shapes to an int 23 | var nativeInt int 24 | if err := json.Unmarshal(data, &nativeInt); err == nil { 25 | *ffoo = IntFromOptionalStruct(nativeInt) 26 | return nil 27 | } 28 | var numStruct struct { 29 | Set bool `json:"set"` 30 | Infinite bool `json:"infinite"` 31 | Number int `json:"number"` 32 | } 33 | err := json.Unmarshal(data, &numStruct) 34 | if err != nil { 35 | return err 36 | } 37 | if !numStruct.Set { 38 | *ffoo = IntFromOptionalStruct(-1) 39 | return fmt.Errorf("avg num not set") 40 | } 41 | if numStruct.Infinite { 42 | *ffoo = IntFromOptionalStruct(-1) 43 | return fmt.Errorf("num set to infinite") 44 | } 45 | *ffoo = IntFromOptionalStruct(numStruct.Number) 46 | return nil 47 | } 48 |
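Both wire shapes decode to the same integer; a quick self-contained illustration of the coercion (IntFromOptionalStruct is exported, so it is importable from the exporter package):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/rivosinc/prometheus-slurm-exporter/exporter"
)

func main() {
	var plain, wrapped exporter.IntFromOptionalStruct
	_ = json.Unmarshal([]byte(`43`), &plain) // slurm 23.x data_parser: a bare int
	_ = json.Unmarshal([]byte(`{"set": true, "infinite": false, "number": 43}`), &wrapped) // 24.x shape
	fmt.Println(plain == 43, wrapped == 43) // true true
}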
49 | type UserRpcInfo struct { 50 | User string `json:"user"` 51 | UserId int `json:"user_id"` 52 | Count int `json:"count"` 53 | AvgTime IntFromOptionalStruct `json:"average_time"` 54 | TotalTime int `json:"total_time"` 55 | } 56 | 57 | type MessageRpcInfo struct { 58 | MessageType string `json:"message_type"` 59 | TypeId int `json:"type_id"` 60 | Count int `json:"count"` 61 | AvgTime IntFromOptionalStruct `json:"average_time"` 62 | TotalTime int `json:"total_time"` 63 | } 64 | 65 | type DiagMetric struct { 66 | ServerThreadCount int `json:"server_thread_count"` 67 | DBDAgentQueueSize int `json:"dbd_agent_queue_size"` 68 | RpcByUser []UserRpcInfo `json:"rpcs_by_user"` 69 | RpcByMessageType []MessageRpcInfo `json:"rpcs_by_message_type"` 70 | BackfillJobCount int `json:"bf_backfilled_jobs"` 71 | BackfillCycleCountSum int `json:"bf_cycle_sum"` 72 | BackfillCycleCounter int `json:"bf_cycle_counter"` 73 | BackfillLastDepth int `json:"bf_last_depth"` 74 | BackfillLastDepthTry int `json:"bf_last_depth_try"` 75 | } 76 | 77 | type SdiagResponse struct { 78 | // Response coercible between slurm 23 and 24 data versions 79 | Meta struct { 80 | SlurmVersion SlurmVersion `json:"Slurm"` 81 | Plugins map[string]string `json:"plugins"` 82 | Plugin map[string]string `json:"plugin"` 83 | } `json:"meta"` 84 | Statistics DiagMetric `json:"statistics"` 85 | Errors []string `json:"errors"` 86 | Warnings []string `json:"warnings"` 87 | } 88 | 89 | func (sr *SdiagResponse) IsDataParserPlugin() bool { 90 | if sr.Meta.Plugins != nil { 91 | _, ok := sr.Meta.Plugins["data_parser"] 92 | return ok 93 | } 94 | if sr.Meta.Plugin != nil { 95 | _, ok := sr.Meta.Plugin["data_parser"] 96 | return ok 97 | } 98 | return false 99 | } 100 | 101 | func parseDiagMetrics(sdiagResp []byte) (*SdiagResponse, error) { 102 | sdiag := new(SdiagResponse) 103 | err := json.Unmarshal(sdiagResp, sdiag) 104 | return sdiag, err 105 | } 106 | 107 | type DiagnosticsCollector struct { 108 | // collector state 109 | fetcher SlurmByteScraper 110 | diagScrapeError prometheus.Counter 111 | diagScrapeDuration *prometheus.Desc 112 | // user rpc metrics 113 | slurmUserRpcCount *prometheus.Desc 114 | slurmUserRpcTotalTime *prometheus.Desc 115 | // type rpc metrics 116 | slurmTypeRpcCount *prometheus.Desc 117 | slurmTypeRpcAvgTime *prometheus.Desc 118 | slurmTypeRpcTotalTime *prometheus.Desc 119 | // daemon metrics 120 | slurmCtlThreadCount *prometheus.Desc 121 | slurmDbdAgentQueueSize *prometheus.Desc 122 | slurmBackfillJobCount *prometheus.Desc 123 | slurmBackfillCycleCount *prometheus.Desc 124 | slurmBackfillLastDepth *prometheus.Desc 125 | slurmBackfillLastDepthTrySched *prometheus.Desc 126 | slurmBackfillCycleCounter *prometheus.Desc 127 | } 128 | 129 | func NewDiagsCollector(config *Config) *DiagnosticsCollector { 130 | cliOpts := config.cliOpts 131 | return &DiagnosticsCollector{ 132 | fetcher: NewCliScraper(config.cliOpts.sdiag...), 133 | slurmUserRpcCount: prometheus.NewDesc("slurm_rpc_user_count", "slurm rpc count per user", []string{"user"}, nil), 134 | slurmUserRpcTotalTime: prometheus.NewDesc("slurm_rpc_user_total_time", "slurm rpc total time per user", []string{"user"}, nil), 135 | slurmTypeRpcCount: prometheus.NewDesc("slurm_rpc_msg_type_count", "slurm rpc count per message type", []string{"type"}, nil), 136 | slurmTypeRpcAvgTime: prometheus.NewDesc("slurm_rpc_msg_type_avg_time", "slurm rpc avg time per message type", []string{"type"}, nil), 137 | slurmTypeRpcTotalTime: prometheus.NewDesc("slurm_rpc_msg_type_total_time", "slurm rpc total time consumed per message type", []string{"type"}, nil), 138 | slurmCtlThreadCount: prometheus.NewDesc("slurm_daemon_thread_count", "slurm daemon thread count", nil, nil), 139 | slurmDbdAgentQueueSize: prometheus.NewDesc("slurm_dbd_agent_queue_size", "SlurmDBD agent queue size. Number of threads interacting with SlurmDBD. Will grow rapidly if the DB is down or under stress", nil, nil), 140 | slurmBackfillJobCount: prometheus.NewDesc("slurm_backfill_job_count", "slurm number of jobs started thanks to backfilling since last slurm start", nil, nil), 141 | slurmBackfillCycleCount: prometheus.NewDesc("slurm_backfill_cycle_count", "slurm total time of backfill scheduling cycles since last reset (bf_cycle_sum)", nil, nil), 142 | slurmBackfillLastDepth: prometheus.NewDesc("slurm_backfill_last_depth", "slurm number of processed jobs during last backfilling scheduling cycle. It counts every job even if that job can not be started due to dependencies or limits", nil, nil), 143 | slurmBackfillLastDepthTrySched: prometheus.NewDesc("slurm_backfill_last_depth_try_sched", "slurm number of processed jobs during last backfilling scheduling cycle. It counts only jobs with a chance to start using available resources", nil, nil), 144 | slurmBackfillCycleCounter: prometheus.NewDesc("slurm_backfill_cycle_counter", "slurm number of backfill scheduling cycles since last reset", nil, nil), 145 | diagScrapeError: prometheus.NewCounter(prometheus.CounterOpts{ 146 | Name: "slurm_diag_scrape_error", 147 | Help: "slurm diag scrape error", 148 | }), 149 | diagScrapeDuration: prometheus.NewDesc("slurm_diag_scrape_duration", fmt.Sprintf("how long the cmd %v took (ms)", cliOpts.sdiag), nil, nil), 150 | } 151 | } 152 |
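The fixture-driven tests further down drain the metric channel by hand; for a one-line sanity check, client_golang's testutil package can count what a collector emits. A sketch in the same package as diags_test.go (testutil.CollectAndCount is the real helper, imported from github.com/prometheus/client_golang/prometheus/testutil):

func TestDiagCollectCount(t *testing.T) {
	config, err := NewConfig(new(CliFlags))
	if err != nil {
		t.Fatal(err)
	}
	dc := NewDiagsCollector(config)
	dc.fetcher = &MockScraper{fixture: "fixtures/sdiag.json"}
	// CollectAndCount registers the collector on a throwaway registry and counts emitted samples
	if n := testutil.CollectAndCount(dc); n == 0 {
		t.Fatal("expected diag metrics from the fixture")
	}
}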
153 | func (sc *DiagnosticsCollector) Describe(ch chan<- *prometheus.Desc) { 154 | ch <- sc.slurmUserRpcCount 155 | ch <- sc.slurmUserRpcTotalTime 156 | ch <- sc.slurmTypeRpcCount 157 | ch <- sc.slurmTypeRpcAvgTime 158 | ch <- sc.slurmTypeRpcTotalTime 159 | ch <- sc.slurmCtlThreadCount 160 | ch <- sc.diagScrapeDuration 161 | ch <- sc.slurmDbdAgentQueueSize 162 | ch <- sc.slurmBackfillJobCount 163 | ch <- sc.slurmBackfillCycleCount 164 | ch <- sc.slurmBackfillLastDepth 165 | ch <- sc.slurmBackfillLastDepthTrySched 166 | ch <- sc.slurmBackfillCycleCounter 167 | ch <- sc.diagScrapeError.Desc() 168 | } 169 | 170 | func (sc *DiagnosticsCollector) Collect(ch chan<- prometheus.Metric) { 171 | defer func() { 172 | ch <- sc.diagScrapeError 173 | }() 174 | sdiag, err := sc.fetcher.FetchRawBytes() 175 | if err != nil { 176 | sc.diagScrapeError.Inc() 177 | slog.Error(fmt.Sprintf("sdiag fetch error %q", err)) 178 | return 179 | } 180 | ch <- prometheus.MustNewConstMetric(sc.diagScrapeDuration, prometheus.GaugeValue, float64(sc.fetcher.Duration().Abs().Milliseconds())) 181 | sdiagResponse, err := parseDiagMetrics(sdiag) 182 | if err != nil { 183 | sc.diagScrapeError.Inc() 184 | slog.Error(fmt.Sprintf("diag parse error: %q", err)) 185 | return 186 | } 187 | if !sdiagResponse.IsDataParserPlugin() { 188 | sc.diagScrapeError.Inc() 189 | slog.Error("only the data_parser plugin is supported") 190 | return 191 | } 192 | emitNonZero := func(desc *prometheus.Desc, val float64, label string) { 193 | if val > 0 { 194 | ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, label) 195 | } 196 | } 197 | ch <- prometheus.MustNewConstMetric(sc.slurmCtlThreadCount, prometheus.GaugeValue, float64(sdiagResponse.Statistics.ServerThreadCount)) 198 | ch <- prometheus.MustNewConstMetric(sc.slurmDbdAgentQueueSize, prometheus.GaugeValue, float64(sdiagResponse.Statistics.DBDAgentQueueSize)) 199 | ch <- prometheus.MustNewConstMetric(sc.slurmBackfillJobCount, prometheus.GaugeValue, float64(sdiagResponse.Statistics.BackfillJobCount)) 200 | ch <- prometheus.MustNewConstMetric(sc.slurmBackfillCycleCount, prometheus.GaugeValue, float64(sdiagResponse.Statistics.BackfillCycleCountSum)) 201 | ch <- prometheus.MustNewConstMetric(sc.slurmBackfillLastDepth, prometheus.GaugeValue, float64(sdiagResponse.Statistics.BackfillLastDepth)) 202 | ch <- prometheus.MustNewConstMetric(sc.slurmBackfillLastDepthTrySched, prometheus.GaugeValue, float64(sdiagResponse.Statistics.BackfillLastDepthTry)) 203 | ch <- prometheus.MustNewConstMetric(sc.slurmBackfillCycleCounter, prometheus.GaugeValue, float64(sdiagResponse.Statistics.BackfillCycleCounter)) 204 | for _, userRpcInfo := range sdiagResponse.Statistics.RpcByUser { 205 | emitNonZero(sc.slurmUserRpcCount, float64(userRpcInfo.Count), userRpcInfo.User) 206 | emitNonZero(sc.slurmUserRpcTotalTime, float64(userRpcInfo.TotalTime), userRpcInfo.User) 207 | } 208 | for _, typeRpcInfo := range sdiagResponse.Statistics.RpcByMessageType { 209 |
emitNonZero(sc.slurmTypeRpcAvgTime, float64(typeRpcInfo.AvgTime), typeRpcInfo.MessageType) 210 | emitNonZero(sc.slurmTypeRpcCount, float64(typeRpcInfo.Count), typeRpcInfo.MessageType) 211 | emitNonZero(sc.slurmTypeRpcTotalTime, float64(typeRpcInfo.TotalTime), typeRpcInfo.MessageType) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /exporter/diags_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/prometheus/client_golang/prometheus" 11 | "github.com/stretchr/testify/assert" 12 | ) 13 | 14 | func TestParseDiagJson(t *testing.T) { 15 | assert := assert.New(t) 16 | fetcher := MockScraper{fixture: "fixtures/sdiag.json"} 17 | sdiag, err := fetcher.FetchRawBytes() 18 | assert.NoError(err) 19 | resp, err := parseDiagMetrics(sdiag) 20 | assert.NoError(err) 21 | assert.Contains(resp.Meta.Plugins, "data_parser") 22 | } 23 | 24 | func TestDiagCollect(t *testing.T) { 25 | assert := assert.New(t) 26 | config, err := NewConfig(new(CliFlags)) 27 | assert.NoError(err) 28 | dc := NewDiagsCollector(config) 29 | dc.fetcher = &MockScraper{fixture: "fixtures/sdiag.json"} 30 | metricChan := make(chan prometheus.Metric) 31 | go func() { 32 | dc.Collect(metricChan) 33 | close(metricChan) 34 | }() 35 | metrics := make([]prometheus.Metric, 0) 36 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 37 | metrics = append(metrics, m) 38 | t.Logf("Received metric %s", m.Desc().String()) 39 | } 40 | assert.NotEmpty(metrics) 41 | } 42 | 43 | func TestDiagCollect_2405(t *testing.T) { 44 | assert := assert.New(t) 45 | config, err := NewConfig(new(CliFlags)) 46 | assert.NoError(err) 47 | dc := NewDiagsCollector(config) 48 | dc.fetcher = &MockScraper{fixture: "fixtures/sdiag_2405.json"} 49 | metricChan := make(chan prometheus.Metric) 50 | go func() { 51 | dc.Collect(metricChan) 52 | close(metricChan) 53 | }() 54 | metrics := make([]prometheus.Metric, 0) 55 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 56 | metrics = append(metrics, m) 57 | t.Logf("Received metric %s", m.Desc().String()) 58 | } 59 | assert.NotEmpty(metrics) 60 | } 61 | 62 | func TestDiagDescribe(t *testing.T) { 63 | assert := assert.New(t) 64 | ch := make(chan *prometheus.Desc) 65 | config, err := NewConfig(new(CliFlags)) 66 | assert.Nil(err) 67 | dc := NewDiagsCollector(config) 68 | dc.fetcher = &MockScraper{fixture: "fixtures/sdiag.json"} 69 | go func() { 70 | dc.Describe(ch) 71 | close(ch) 72 | }() 73 | descs := make([]*prometheus.Desc, 0) 74 | for desc, ok := <-ch; ok; desc, ok = <-ch { 75 | descs = append(descs, desc) 76 | } 77 | assert.NotEmpty(descs) 78 | } 79 | 80 | func TestDataParserVersionDiscovery_Slurm23(t *testing.T) { 81 | assert := assert.New(t) 82 | fetcher := MockScraper{fixture: "fixtures/sdiag.json"} 83 | sdiag, err := fetcher.FetchRawBytes() 84 | assert.NoError(err) 85 | resp, err := parseDiagMetrics(sdiag) 86 | assert.NoError(err) 87 | assert.True(resp.IsDataParserPlugin()) 88 | } 89 | 90 | func TestDataParserVersionDiscovery_Slurm24(t *testing.T) { 91 | assert := assert.New(t) 92 | fetcher := MockScraper{fixture: "fixtures/sdiag_2405.json"} 93 | sdiag, err := fetcher.FetchRawBytes() 94 | assert.NoError(err) 95 | resp, err := parseDiagMetrics(sdiag) 96 | assert.NoError(err) 97 | assert.Truef(resp.IsDataParserPlugin(), "parsed metadata struct %+v", resp.Meta) 98 
| } 99 | -------------------------------------------------------------------------------- /exporter/fixtures/license_out.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": { 3 | "plugins": { 4 | "data_parser": "data_parser/v0.0.39", 5 | "accounting_storage": "accounting_storage/slurmdbd" 6 | }, 7 | "command": [ 8 | "show", 9 | "lic" 10 | ], 11 | "Slurm": { 12 | "version": { 13 | "major": 23, 14 | "micro": 4, 15 | "minor": 2 16 | }, 17 | "release": "23.02.4" 18 | } 19 | }, 20 | "licenses": [{ 21 | "LicenseName": "AscentLintBase", 22 | "Total": 420, 23 | "Used": 213, 24 | "Free": 205, 25 | "Remote": true, 26 | "Reserved": 357, 27 | "LastConsumed": 218, 28 | "LastDeficit": 2, 29 | "LastUpdate": "2024-12-16T08:42:16" 30 | }], 31 | "warnings": [], 32 | "errors": [] 33 | } 34 | -------------------------------------------------------------------------------- /exporter/fixtures/license_out.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/sacctmgr.txt: -------------------------------------------------------------------------------- 1 | |root|||| 2 | |account1|993|15917500|| 3 | |account2|198|3183500|| 4 | |account3|||| 5 | |account4|7948|95505000|8000|60000 6 | |account5|3974|47752500|4000|30000 7 | shouldignore|account5|3978|477500|405|301 8 | -------------------------------------------------------------------------------- /exporter/fixtures/sacctmgr.txt.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/sdiag.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": { 3 | "plugins": { 4 | "data_parser": "data_parser/v0.0.39", 5 | "accounting_storage": "accounting_storage/slurmdbd" 6 | }, 7 | "command": [ 8 | "sdiag", 9 | "--json" 10 | ], 11 | "Slurm": { 12 | "version": { 13 | "major": 23, 14 | "micro": 4, 15 | "minor": 2 16 | }, 17 | "release": "23.02.4" 18 | } 19 | }, 20 | "statistics": { 21 | "parts_packed": 1, 22 | "req_time": 1698885571, 23 | "req_time_start": 1698883200, 24 | "server_thread_count": 3, 25 | "agent_queue_size": 0, 26 | "agent_count": 0, 27 | "agent_thread_count": 0, 28 | "dbd_agent_queue_size": 5, 29 | "gettimeofday_latency": 18, 30 | "schedule_cycle_max": 1193926, 31 | "schedule_cycle_last": 10824, 32 | "schedule_cycle_total": 3818, 33 | "schedule_cycle_mean": 30921, 34 | "schedule_cycle_mean_depth": 134, 35 | "schedule_cycle_per_minute": 97, 36 | "schedule_queue_length": 2647, 37 | "jobs_submitted": 5824, 38 | "jobs_started": 3671, 39 | "jobs_completed": 3681, 40 | "jobs_canceled": 41, 41 | "jobs_failed": 0, 42 | "jobs_pending": 13455, 43 | "jobs_running": 4993, 44 | "job_states_ts": 1698885543, 45 | "bf_backfilled_jobs": 2488, 46 | "bf_last_backfilled_jobs": 1316, 47 | "bf_backfilled_het_jobs": 0, 48 | "bf_cycle_counter": 72, 49 | "bf_cycle_mean": 2382553, 50 | "bf_depth_mean": 113, 51 | "bf_depth_mean_try": 37, 52 | "bf_cycle_sum": 171543834, 53 | "bf_cycle_last": 4097505, 54 | "bf_last_depth": 362, 55 | "bf_last_depth_try": 72, 56 | "bf_depth_sum": 8141, 57 | "bf_depth_try_sum": 2695, 58 | "bf_queue_len": 2689, 59 | "bf_queue_len_mean": 3371, 60 | "bf_queue_len_sum": 242773, 61 | "bf_table_size": 65, 62 | "bf_table_size_mean": 3371, 63 | "bf_when_last_cycle": 1698885541, 64 | "bf_active": false, 65 | "rpcs_by_message_type": [ 66 | { 67 | "message_type": "REQUEST_FED_INFO", 68 | "type_id": 2049, 69 | "count": 94114, 70 | "average_time": 43, 71 | "total_time": 4069862 72 | }, 73 | { 74 | "message_type": "REQUEST_JOB_USER_INFO", 75 | "type_id": 2039, 76 | "count": 12555, 77 | "average_time": 148789, 78 | "total_time": 1868051447 79 | }, 80 | { 81 | "message_type": "REQUEST_SUBMIT_BATCH_JOB", 82 | "type_id": 4003, 83 | "count": 9038, 84 | "average_time": 51838, 85 | "total_time": 468516537 86 | } 87 | ], 88 | "rpcs_by_user": [ 89 | { 90 | "user": "root", 91 | "user_id": 0, 92 | "count": 141368, 93 | "average_time": 175628, 94 | "total_time": 24828276785 95 | }, 96 | { 97 | "user": "abdh", 98 | "user_id": 1977600400, 99 | "count": 20954, 100 | "average_time": 44311, 101 | "total_time": 928512674 102 | } 103 | ] 104 | }, 105 | "warnings": [], 106 | "errors": [] 107 | } 108 | -------------------------------------------------------------------------------- /exporter/fixtures/sdiag.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/sdiag_2405.json: -------------------------------------------------------------------------------- 1 | { 2 | "statistics": { 3 | "parts_packed": 1, 4 | "req_time": { 5 | "set": true, 6 | "infinite": false, 7 | "number": 1739832148 8 | }, 9 | "req_time_start": { 10 | "set": true, 11 | "infinite": false, 12 | "number": 1739822537 13 | }, 14 | "server_thread_count": 2, 15 | "agent_queue_size": 0, 16 | "agent_count": 0, 17 | "agent_thread_count": 0, 18 | "dbd_agent_queue_size": 0, 19 | "gettimeofday_latency": 33, 20 | "schedule_cycle_max": 1666, 21 | "schedule_cycle_last": 110, 22 | "schedule_cycle_sum": 10791, 23 | "schedule_cycle_total": 162, 24 | "schedule_cycle_mean": 66, 25 | "schedule_cycle_mean_depth": 0, 26 | "schedule_cycle_per_minute": 1, 27 | "schedule_cycle_depth": 0, 28 | "schedule_exit": { 29 | "end_job_queue": 162, 30 | "default_queue_depth": 0, 31 | "max_job_start": 0, 32 | "max_rpc_cnt": 0, 33 | "max_sched_time": 0, 34 | "licenses": 0 35 | }, 36 | "schedule_queue_length": 0, 37 | "jobs_submitted": 1, 38 | "jobs_started": 1, 39 | "jobs_completed": 1, 40 | "jobs_canceled": 0, 41 | "jobs_failed": 0, 42 | "jobs_pending": 0, 43 | "jobs_running": 0, 44 | "job_states_ts": { 45 | "set": true, 46 | "infinite": false, 47 | "number": 1739832137 48 | }, 49 | "bf_backfilled_jobs": 0, 50 | "bf_last_backfilled_jobs": 0, 51 | "bf_backfilled_het_jobs": 0, 52 | "bf_cycle_counter": 0, 53 | "bf_cycle_mean": 0, 54 | "bf_depth_mean": 0, 55 | "bf_depth_mean_try": 0, 56 | "bf_cycle_sum": 0, 57 | "bf_cycle_last": 0, 58 | "bf_cycle_max": 0, 59 | "bf_exit": { 60 | "end_job_queue": 0, 61 | "bf_max_job_start": 0, 62 | "bf_max_job_test": 0, 63 | "bf_max_time": 0, 64 | "bf_node_space_size": 0, 65 | "state_changed": 0 66 | }, 67 | "bf_last_depth": 0, 68 | "bf_last_depth_try": 0, 69 | "bf_depth_sum": 0, 70 | "bf_depth_try_sum": 0, 71 | "bf_queue_len": 0, 72 | "bf_queue_len_mean": 0, 73 | "bf_queue_len_sum": 0, 74 | "bf_table_size": 0, 75 | "bf_table_size_sum": 0, 76 | "bf_table_size_mean": 0, 77 | "bf_when_last_cycle": { 78 | "set": true, 79 | "infinite": false, 80 | "number": 0 81 | }, 82 | "bf_active": false, 83 | "rpcs_by_message_type": [ 84 | { 85 | "type_id": 1002, 86 | "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", 87 | "count": 6, 88 | "queued": 0, 89 | "dropped": 0, 90 | "cycle_last": 0, 91 | "cycle_max": 0, 92 | "total_time": 4969, 93 | "average_time": { 94 | "set": true, 95 | "infinite": false, 96 | "number": 828 97 | } 98 | }, 99 | { 100 | "type_id": 4001, 101 | "message_type": "REQUEST_RESOURCE_ALLOCATION", 102 | "count": 1, 103 | "queued": 0, 104 | "dropped": 0, 105 | "cycle_last": 0, 106 | "cycle_max": 0, 107 | "total_time": 17873, 108 | "average_time": { 109 | "set": true, 110 | "infinite": false, 111 | "number": 17873 112 | } 113 | }, 114 | { 115 | "type_id": 4019, 116 | "message_type": "REQUEST_JOB_READY", 117 | "count": 1, 118 | "queued": 0, 119 | "dropped": 0, 120 | "cycle_last": 0, 121 | "cycle_max": 0, 122 | "total_time": 332, 123 | "average_time": { 124 | "set": true, 125 | "infinite": false, 126 | "number": 332 127 | } 128 | }, 129 | { 130 | "type_id": 5001, 131 | "message_type": "REQUEST_JOB_STEP_CREATE", 132 | "count": 1, 133 | "queued": 0, 134 | "dropped": 0, 135 | "cycle_last": 0, 136 | "cycle_max": 0, 137 | "total_time": 6331, 138 | "average_time": { 139 | "set": true, 140 | "infinite": false, 141 | "number": 6331 142 | } 143 | }, 
144 | { 145 | "type_id": 5017, 146 | "message_type": "REQUEST_COMPLETE_JOB_ALLOCATION", 147 | "count": 1, 148 | "queued": 0, 149 | "dropped": 0, 150 | "cycle_last": 0, 151 | "cycle_max": 0, 152 | "total_time": 2113, 153 | "average_time": { 154 | "set": true, 155 | "infinite": false, 156 | "number": 2113 157 | } 158 | }, 159 | { 160 | "type_id": 5016, 161 | "message_type": "REQUEST_STEP_COMPLETE", 162 | "count": 1, 163 | "queued": 0, 164 | "dropped": 0, 165 | "cycle_last": 0, 166 | "cycle_max": 0, 167 | "total_time": 739, 168 | "average_time": { 169 | "set": true, 170 | "infinite": false, 171 | "number": 739 172 | } 173 | }, 174 | { 175 | "type_id": 6012, 176 | "message_type": "MESSAGE_EPILOG_COMPLETE", 177 | "count": 1, 178 | "queued": 0, 179 | "dropped": 0, 180 | "cycle_last": 0, 181 | "cycle_max": 0, 182 | "total_time": 635, 183 | "average_time": { 184 | "set": true, 185 | "infinite": false, 186 | "number": 635 187 | } 188 | }, 189 | { 190 | "type_id": 2035, 191 | "message_type": "REQUEST_STATS_INFO", 192 | "count": 5, 193 | "queued": 0, 194 | "dropped": 0, 195 | "cycle_last": 0, 196 | "cycle_max": 0, 197 | "total_time": 1951, 198 | "average_time": { 199 | "set": true, 200 | "infinite": false, 201 | "number": 390 202 | } 203 | }, 204 | { 205 | "type_id": 2009, 206 | "message_type": "REQUEST_PARTITION_INFO", 207 | "count": 6, 208 | "queued": 0, 209 | "dropped": 0, 210 | "cycle_last": 0, 211 | "cycle_max": 0, 212 | "total_time": 1391, 213 | "average_time": { 214 | "set": true, 215 | "infinite": false, 216 | "number": 231 217 | } 218 | }, 219 | { 220 | "type_id": 2003, 221 | "message_type": "REQUEST_JOB_INFO", 222 | "count": 3, 223 | "queued": 0, 224 | "dropped": 0, 225 | "cycle_last": 0, 226 | "cycle_max": 0, 227 | "total_time": 2975, 228 | "average_time": { 229 | "set": true, 230 | "infinite": false, 231 | "number": 991 232 | } 233 | }, 234 | { 235 | "type_id": 2007, 236 | "message_type": "REQUEST_NODE_INFO", 237 | "count": 3, 238 | "queued": 0, 239 | "dropped": 0, 240 | "cycle_last": 0, 241 | "cycle_max": 0, 242 | "total_time": 1051, 243 | "average_time": { 244 | "set": true, 245 | "infinite": false, 246 | "number": 350 247 | } 248 | } 249 | ], 250 | "rpcs_by_user": [ 251 | { 252 | "user_id": 0, 253 | "user": "root", 254 | "count": 29, 255 | "total_time": 40360, 256 | "average_time": { 257 | "set": true, 258 | "infinite": false, 259 | "number": 1391 260 | } 261 | } 262 | ], 263 | "pending_rpcs": [], 264 | "pending_rpcs_by_hostlist": [] 265 | }, 266 | "meta": { 267 | "plugin": { 268 | "type": "", 269 | "name": "", 270 | "data_parser": "data_parser/v0.0.41", 271 | "accounting_storage": "" 272 | }, 273 | "client": { 274 | "source": "/dev/pts/0", 275 | "user": "root", 276 | "group": "root" 277 | }, 278 | "command": ["sdiag"], 279 | "slurm": { 280 | "version": { 281 | "major": "24", 282 | "micro": "5", 283 | "minor": "05" 284 | }, 285 | "release": "24.05.5", 286 | "cluster": "default-cluster" 287 | } 288 | }, 289 | "errors": [], 290 | "warnings": [] 291 | } 292 | -------------------------------------------------------------------------------- /exporter/fixtures/sdiag_2405.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/sinfo_fallback.txt: -------------------------------------------------------------------------------- 1 | {"s": "completing", "mem": 770000, "n": "cs156", "l": "N/A", "p": "hw", "fmem": "N/A", "cstate": "56/8/0/64", "w": 1} 2 | {"s": "allocated", "mem": 1000000, "n": "cs25", "l": "20.66", "p": "hw", "fmem": "89124", "cstate": "64/0/0/64", "w": 1} 3 | {"s": "allocated", "mem": 1000000, "n": "cs25", "l": "20.66", "p": "hw-l", "fmem": "89124", "cstate": "64/0/0/64", "w": 1} 4 | {"s": "allocated", "mem": 1000000, "n": "cs25", "l": "20.66", "p": "hw-m", "fmem": "89124", "cstate": "64/0/0/64", "w": 1} 5 | {"s": "allocated", "mem": 1000000, "n": "cs25", "l": "20.66", "p": "hw-h", "fmem": "89124", "cstate": "64/0/0/64", "w": 1} 6 | {"s": "allocated", "mem": 1000000, "n": "cs25", "l": "20.66", "p": "cdn", "fmem": "89124", "cstate": "64/0/0/64", "w": 1} 7 | {"s": "idle", "mem": 1000000, "n": "cs31", "l": "2.59", "p": "cdn", "fmem": "751243", "cstate": "0/64/0/64", "w": 1} 8 | {"s": "mixed", "mem": 770000, "n": "cs53", "l": "16.12", "p": "hw", "fmem": "485125", "cstate": "52/12/0/64", "w": 1} 9 | -------------------------------------------------------------------------------- /exporter/fixtures/sinfo_fallback.txt.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/sinfo_out.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": { 3 | "plugin": { 4 | "type": "openapi/v0.0.37", 5 | "name": "Slurm OpenAPI v0.0.37" 6 | }, 7 | "Slurm": { 8 | "version": { 9 | "major": 21, 10 | "micro": 5, 11 | "minor": 8 12 | }, 13 | "release": "21.08.5" 14 | } 15 | }, 16 | "errors": [], 17 | "nodes": [ 18 | { 19 | "architecture": "x86_64", 20 | "burstbuffer_network_address": "", 21 | "boards": 1, 22 | "boot_time": 1671873827, 23 | "comment": "", 24 | "cores": 16, 25 | "cpu_binding": 0, 26 | "cpu_load": 1, 27 | "extra": "", 28 | "free_memory": 337330, 29 | "cpus": 64, 30 | "last_busy": 1685734519, 31 | "features": "", 32 | "active_features": "", 33 | "gres": "", 34 | "gres_drained": "N/A", 35 | "gres_used": "", 36 | "mcs_label": "", 37 | "name": "cs2.example.company.com", 38 | "next_state_after_reboot": "invalid", 39 | "address": "cs2.example.company.com", 40 | "hostname": "cs2.example.company.com", 41 | "state": "mixed", 42 | "state_flags": [], 43 | "next_state_after_reboot_flags": [], 44 | "operating_system": "Linux 3.10.0-1160.80.1.el7.x86_64 #1 SMP Tue Nov 8 15:48:59 UTC 2022", 45 | "owner": null, 46 | "partitions": [ 47 | "hw" 48 | ], 49 | "port": 6818, 50 | "real_memory": 500000, 51 | "reason": "", 52 | "reason_changed_at": 0, 53 | "reason_set_by_user": null, 54 | "slurmd_start_time": 1685737510, 55 | "sockets": 2, 56 | "threads": 2, 57 | "temporary_disk": 0, 58 | "weight": 1, 59 | "tres": "cpu=64,mem=500000M,billing=64", 60 | "slurmd_version": "21.08.5", 61 | "alloc_memory": 114688, 62 | "alloc_cpus": 4, 63 | "idle_cpus": 60, 64 | "tres_used": "cpu=4,mem=112G", 65 | "tres_weighted": 4.0 66 | }, 67 | { 68 | "architecture": "x86_64", 69 | "burstbuffer_network_address": "", 70 | "boards": 1, 71 | "boot_time": 1671873826, 72 | "comment": "", 73 | "cores": 16, 74 | "cpu_binding": 0, 75 | "cpu_load": 4, 
76 | "extra": "", 77 | "free_memory": 494857, 78 | "cpus": 64, 79 | "last_busy": 1685734525, 80 | "features": "", 81 | "active_features": "", 82 | "gres": "", 83 | "gres_drained": "N/A", 84 | "gres_used": "", 85 | "mcs_label": "", 86 | "name": "cs3.example.company.com", 87 | "next_state_after_reboot": "invalid", 88 | "address": "cs3.example.company.com", 89 | "hostname": "cs3.example.company.com", 90 | "state": "idle", 91 | "state_flags": [], 92 | "next_state_after_reboot_flags": [], 93 | "operating_system": "Linux 3.10.0-1160.80.1.el7.x86_64 #1 SMP Tue Nov 8 15:48:59 UTC 2022", 94 | "owner": null, 95 | "partitions": [ 96 | "hw" 97 | ], 98 | "port": 6818, 99 | "real_memory": 500000, 100 | "reason": "", 101 | "reason_changed_at": 0, 102 | "reason_set_by_user": null, 103 | "slurmd_start_time": 1685737508, 104 | "sockets": 2, 105 | "threads": 2, 106 | "temporary_disk": 0, 107 | "weight": 1, 108 | "tres": "cpu=64,mem=500000M,billing=64", 109 | "slurmd_version": "21.08.5", 110 | "alloc_memory": 0, 111 | "alloc_cpus": 0, 112 | "idle_cpus": 64, 113 | "tres_used": null, 114 | "tres_weighted": 0.0 115 | }, 116 | { 117 | "architecture": "x86_64", 118 | "burstbuffer_network_address": "", 119 | "boards": 1, 120 | "boot_time": 1671873824, 121 | "comment": "", 122 | "cores": 16, 123 | "cpu_binding": 0, 124 | "cpu_load": 2, 125 | "extra": "", 126 | "free_memory": 495693, 127 | "cpus": 64, 128 | "last_busy": 1685734525, 129 | "features": "", 130 | "active_features": "", 131 | "gres": "", 132 | "gres_drained": "N/A", 133 | "gres_used": "", 134 | "mcs_label": "", 135 | "name": "cs4.example.company.com", 136 | "next_state_after_reboot": "invalid", 137 | "address": "cs4.example.company.com", 138 | "hostname": "cs4.example.company.com", 139 | "state": "allocated", 140 | "state_flags": [], 141 | "next_state_after_reboot_flags": [], 142 | "operating_system": "Linux 3.10.0-1160.80.1.el7.x86_64 #1 SMP Tue Nov 8 15:48:59 UTC 2022", 143 | "owner": null, 144 | "partitions": [ 145 | "hw" 146 | ], 147 | "port": 6818, 148 | "real_memory": 500000, 149 | "reason": "", 150 | "reason_changed_at": 0, 151 | "reason_set_by_user": null, 152 | "slurmd_start_time": 1685737506, 153 | "sockets": 2, 154 | "threads": 2, 155 | "temporary_disk": 0, 156 | "weight": 1, 157 | "tres": "cpu=64,mem=500000M,billing=64", 158 | "slurmd_version": "21.08.5", 159 | "alloc_memory": 0, 160 | "alloc_cpus": 0, 161 | "idle_cpus": 64, 162 | "tres_used": null, 163 | "tres_weighted": 0.0 164 | }, 165 | { 166 | "architecture": "x86_64", 167 | "burstbuffer_network_address": "", 168 | "boards": 1, 169 | "boot_time": 1671873824, 170 | "comment": "", 171 | "cores": 16, 172 | "cpu_binding": 0, 173 | "cpu_load": 2, 174 | "extra": "", 175 | "free_memory": 495693, 176 | "cpus": 64, 177 | "last_busy": 1685734525, 178 | "features": "", 179 | "active_features": "", 180 | "gres": "", 181 | "gres_drained": "N/A", 182 | "gres_used": "", 183 | "mcs_label": "", 184 | "name": "cs5.example.company.com", 185 | "next_state_after_reboot": "invalid", 186 | "address": "cs5.example.company.com", 187 | "hostname": "cs5.example.company.com", 188 | "state": "down", 189 | "state_flags": [], 190 | "next_state_after_reboot_flags": [], 191 | "operating_system": "Linux 3.10.0-1160.80.1.el7.x86_64 #1 SMP Tue Nov 8 15:48:59 UTC 2022", 192 | "owner": null, 193 | "partitions": [ 194 | "hw" 195 | ], 196 | "port": 6818, 197 | "real_memory": 500000, 198 | "reason": "", 199 | "reason_changed_at": 0, 200 | "reason_set_by_user": null, 201 | "slurmd_start_time": 1685737506, 202 | "sockets": 2, 203 
| "threads": 2, 204 | "temporary_disk": 0, 205 | "weight": 1, 206 | "tres": "cpu=64,mem=500000M,billing=64", 207 | "slurmd_version": "21.08.5", 208 | "alloc_memory": 0, 209 | "alloc_cpus": 0, 210 | "idle_cpus": 64, 211 | "tres_used": null, 212 | "tres_weighted": 0.0 213 | } 214 | ] 215 | } 216 | -------------------------------------------------------------------------------- /exporter/fixtures/sinfo_out.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/squeue_fallback.txt: -------------------------------------------------------------------------------- 1 | {"a": "account1", "id": 26515966, "end_time": "2023-09-21T00:21:42", "state": "RUNNING", "p": "hw-h", "cpu": 1, "mem": "128G", "array_id": "N/A", "r": "cs10"} 2 | {"a": "account1", "id": 50580016, "end_time": "2023-09-21T14:31:11", "state": "RUNNING", "p": "hw-l", "cpu": 1, "mem": "62.50G", "array_id": "N/A", "r": "cs10"} 3 | {"a": "account1", "id": 51447051, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "(Dependency)"} 4 | {"a": "account1", "id": 51447052, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "((ReqNodeNotAvail, UnavailableNodes:cs[100,101,102]))"} 5 | {"a": "account1", "id": 51447053, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "(Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions)"} 6 | {"a": "account1", "id": 18804, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": 24, "mem": "118G", "array_id": "N/A", "r": "(Priority)"} 7 | # test counter inc with faulty inputs 8 | {"a": "account1", "id": 18805, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": xx, "mem": "118G", "array_id": "N/A"} 9 | {"a": "account1", "id": 18806, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": xx, "mem": "118G", "array_id": "N/A"} 10 | -------------------------------------------------------------------------------- /exporter/fixtures/squeue_fallback.txt.license: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/squeue_out.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": { 3 | "plugin": { 4 | "type": "openapi/v0.0.37", 5 | "name": "Slurm OpenAPI v0.0.37" 6 | }, 7 | "Slurm": { 8 | "version": { 9 | "major": 21, 10 | "micro": 5, 11 | "minor": 8 12 | }, 13 | "release": "21.08.5" 14 | } 15 | }, 16 | "errors": [], 17 | "jobs": [ 18 | { 19 | "account": "account1", 20 | "accrue_time": 1684114900, 21 | "admin_comment": "", 22 | "array_job_id": 0, 23 | "array_task_id": null, 24 | "array_max_tasks": 0, 25 | "array_task_string": "", 26 | "association_id": 119, 27 | "batch_features": "", 28 | "batch_flag": true, 29 | "batch_host": "cs75", 30 | "flags": [ 31 | "JOB_CPUS_SET ", 32 | "JOB_ACCRUE_OVER", 33 | "JOB_WAS_RUNNING", 34 | "JOB_MEM_SET" 35 | ], 36 | "burst_buffer": "", 37 | "burst_buffer_state": "", 38 | "cluster": "rivos", 39 | "cluster_features": "", 40 | "command": "sleep 100", 41 | "comment": "", 42 | "contiguous": false, 43 | "core_spec": null, 44 | "thread_spec": null, 45 | "cores_per_socket": null, 46 | "billable_tres": 1.0, 47 | "cpus_per_task": null, 48 | "cpu_frequency_minimum": null, 49 | "cpu_frequency_maximum": null, 50 | "cpu_frequency_governor": null, 51 | "cpus_per_tres": "", 52 | "deadline": 0, 53 | "delay_boot": 0, 54 | "dependency": "", 55 | "derived_exit_code": 0, 56 | "eligible_time": 1684114900, 57 | "end_time": 1686633833, 58 | "excluded_nodes": "", 59 | "exit_code": 0, 60 | "features": "a100-80gb&preemptible&gpu-1", 61 | "federation_origin": "", 62 | "federation_siblings_active": "", 63 | "federation_siblings_viable": "", 64 | "gres_detail": [], 65 | "group_id": 1977700000, 66 | "job_id": 26515966, 67 | "job_resources": { 68 | "nodes": "cs75", 69 | "allocated_cpus": 1, 70 | "allocated_hosts": 1, 71 | "allocated_nodes": { 72 | "0": { 73 | "sockets": { 74 | "1": "unassigned" 75 | }, 76 | "cores": { 77 | "0": "unassigned" 78 | }, 79 | "memory": 64000, 80 | "cpus": 1 81 | } 82 | } 83 | }, 84 | "job_state": "RUNNING", 85 | "last_sched_evaluation": 1684114921, 86 | "licenses": "", 87 | "max_cpus": 0, 88 | "max_nodes": 0, 89 | "mcs_label": "", 90 | "memory_per_tres": "", 91 | "name": "job_name", 92 | "nodes": "cs75", 93 | "nice": null, 94 | "tasks_per_core": null, 95 | "tasks_per_node": 0, 96 | "tasks_per_socket": null, 97 | "tasks_per_board": 0, 98 | "cpus": 1, 99 | "node_count": 1, 100 | "tasks": 1, 101 | "het_job_id": 0, 102 | "het_job_id_set": "", 103 | "het_job_offset": 0, 104 | "partition": "hw-l", 105 | "memory_per_node": null, 106 | "memory_per_cpu": 64000, 107 | "minimum_cpus_per_node": 1, 108 | "minimum_tmp_disk_per_node": 0, 109 | "preempt_time": 0, 110 | "pre_sus_time": 1381100, 111 | "priority": 1013, 112 | "profile": null, 113 | "qos": "normal", 114 | "reboot": false, 115 | "required_nodes": "", 116 | "requeue": true, 117 | "resize_time": 0, 118 | "restart_cnt": 0, 119 | "resv_name": "", 120 | "shared": null, 121 | "show_flags": [ 122 | "SHOW_ALL", 123 | "SHOW_DETAIL", 124 | "SHOW_LOCAL" 125 | ], 126 | "sockets_per_board": 0, 127 | "sockets_per_node": null, 128 | "start_time": 1684114921, 129 | "state_description": "", 130 | "state_reason": "None", 131 | "standard_error": "/nfs/user/stderr.log", 132 | "standard_input": "/dev/null", 133 | "standard_output": "/nfs/user/stdout.log", 134 | "submit_time": 1684114900, 135 | "suspend_time": 1686200533, 136 | "system_comment": 
"", 137 | "time_limit": 30240, 138 | "time_minimum": 0, 139 | "threads_per_core": 1, 140 | "tres_bind": "", 141 | "tres_freq": "", 142 | "tres_per_job": "", 143 | "tres_per_node": "", 144 | "tres_per_socket": "", 145 | "tres_per_task": "", 146 | "tres_req_str": "cpu=1,mem=62.50G,node=1,billing=1", 147 | "tres_alloc_str": "cpu=1,mem=62.50G,node=1,billing=1", 148 | "user_id": 1977600017, 149 | "user_name": "bkd", 150 | "wckey": "", 151 | "current_working_directory": "/somedir/on/nfs" 152 | }, 153 | { 154 | "account": "account1", 155 | "accrue_time": 0, 156 | "admin_comment": "", 157 | "allocating_node": "bkd", 158 | "array_job_id": { 159 | "set": true, 160 | "infinite": false, 161 | "number": 58948420 162 | }, 163 | "array_task_id": { 164 | "set": false, 165 | "infinite": false, 166 | "number": 0 167 | }, 168 | "array_max_tasks": { 169 | "set": true, 170 | "infinite": false, 171 | "number": 0 172 | }, 173 | "array_task_string": "1-10", 174 | "association_id": 4001, 175 | "batch_features": "", 176 | "batch_flag": true, 177 | "batch_host": "", 178 | "flags": [ 179 | "EXACT_TASK_COUNT_REQUESTED", 180 | "EXACT_CPU_COUNT_REQUESTED", 181 | "EXACT_MEMORY_REQUESTED", 182 | "USING_DEFAULT_QOS", 183 | "USING_DEFAULT_WCKEY", 184 | "DEPENDENT" 185 | ], 186 | "burst_buffer": "", 187 | "burst_buffer_state": "", 188 | "cluster": "rivos", 189 | "cluster_features": "", 190 | "command": "", 191 | "comment": "", 192 | "container": "", 193 | "container_id": "", 194 | "contiguous": false, 195 | "core_spec": 0, 196 | "thread_spec": 32766, 197 | "cores_per_socket": { 198 | "set": false, 199 | "infinite": false, 200 | "number": 0 201 | }, 202 | "billable_tres": { 203 | "set": false, 204 | "infinite": false, 205 | "number": 0.0 206 | }, 207 | "cpus_per_task": { 208 | "set": true, 209 | "infinite": false, 210 | "number": 1 211 | }, 212 | "cpu_frequency_minimum": { 213 | "set": false, 214 | "infinite": false, 215 | "number": 0 216 | }, 217 | "cpu_frequency_maximum": { 218 | "set": false, 219 | "infinite": false, 220 | "number": 0 221 | }, 222 | "cpu_frequency_governor": { 223 | "set": false, 224 | "infinite": false, 225 | "number": 0 226 | }, 227 | "cpus_per_tres": "", 228 | "cron": "", 229 | "deadline": 1729802248, 230 | "delay_boot": { 231 | "set": true, 232 | "infinite": false, 233 | "number": 0 234 | }, 235 | "dependency": "afterok:58948419(unfulfilled)", 236 | "derived_exit_code": { 237 | "set": true, 238 | "infinite": false, 239 | "number": 0 240 | }, 241 | "eligible_time": 0, 242 | "end_time": 0, 243 | "excluded_nodes": "", 244 | "exit_code": { 245 | "set": true, 246 | "infinite": false, 247 | "number": 0 248 | }, 249 | "extra": "", 250 | "failed_node": "", 251 | "features": "", 252 | "federation_origin": "", 253 | "federation_siblings_active": "", 254 | "federation_siblings_viable": "", 255 | "gres_detail": [ 256 | ], 257 | "group_id": 1977700000, 258 | "group_name": "rvs", 259 | "het_job_id": { 260 | "set": true, 261 | "infinite": false, 262 | "number": 0 263 | }, 264 | "het_job_id_set": "", 265 | "het_job_offset": { 266 | "set": true, 267 | "infinite": false, 268 | "number": 0 269 | }, 270 | "job_id": 58948420, 271 | "job_resources": { 272 | }, 273 | "job_size_str": [ 274 | ], 275 | "job_state": "PENDING", 276 | "last_sched_evaluation": 1729715848, 277 | "licenses": "rtl_single_core@r,", 278 | "mail_type": [ 279 | ], 280 | "mail_user": "bkd", 281 | "max_cpus": { 282 | "set": true, 283 | "infinite": false, 284 | "number": 0 285 | }, 286 | "max_nodes": { 287 | "set": true, 288 | "infinite": false, 289 | 
"number": 0 290 | }, 291 | "mcs_label": "", 292 | "memory_per_tres": "", 293 | "name": "some job name", 294 | "network": "", 295 | "nodes": "", 296 | "nice": 0, 297 | "tasks_per_core": { 298 | "set": false, 299 | "infinite": true, 300 | "number": 0 301 | }, 302 | "tasks_per_tres": { 303 | "set": true, 304 | "infinite": false, 305 | "number": 0 306 | }, 307 | "tasks_per_node": { 308 | "set": true, 309 | "infinite": false, 310 | "number": 0 311 | }, 312 | "tasks_per_socket": { 313 | "set": false, 314 | "infinite": true, 315 | "number": 0 316 | }, 317 | "tasks_per_board": { 318 | "set": true, 319 | "infinite": false, 320 | "number": 0 321 | }, 322 | "cpus": { 323 | "set": true, 324 | "infinite": false, 325 | "number": 1 326 | }, 327 | "node_count": { 328 | "set": true, 329 | "infinite": false, 330 | "number": 1 331 | }, 332 | "tasks": { 333 | "set": true, 334 | "infinite": false, 335 | "number": 1 336 | }, 337 | "partition": "hw-m", 338 | "prefer": "", 339 | "memory_per_cpu": { 340 | "set": false, 341 | "infinite": false, 342 | "number": 0 343 | }, 344 | "memory_per_node": { 345 | "set": true, 346 | "infinite": false, 347 | "number": 131072 348 | }, 349 | "minimum_cpus_per_node": { 350 | "set": true, 351 | "infinite": false, 352 | "number": 1 353 | }, 354 | "minimum_tmp_disk_per_node": { 355 | "set": true, 356 | "infinite": false, 357 | "number": 0 358 | }, 359 | "power": { 360 | "flags": [ 361 | ] 362 | }, 363 | "preempt_time": 0, 364 | "preemptable_time": 0, 365 | "pre_sus_time": 0, 366 | "hold": false, 367 | "priority": { 368 | "set": true, 369 | "infinite": false, 370 | "number": 1368 371 | }, 372 | "profile": [ 373 | "NOT_SET" 374 | ], 375 | "qos": "normal", 376 | "reboot": false, 377 | "required_nodes": "", 378 | "minimum_switches": 0, 379 | "requeue": true, 380 | "resize_time": 0, 381 | "restart_cnt": 0, 382 | "resv_name": "", 383 | "scheduled_nodes": "", 384 | "selinux_context": "", 385 | "shared": [ 386 | "oversubscribe" 387 | ], 388 | "exclusive": [ 389 | "false" 390 | ], 391 | "oversubscribe": true, 392 | "show_flags": [ 393 | "DETAIL", 394 | "LOCAL" 395 | ], 396 | "sockets_per_board": 0, 397 | "sockets_per_node": { 398 | "set": false, 399 | "infinite": false, 400 | "number": 0 401 | }, 402 | "start_time": 0, 403 | "state_description": "", 404 | "state_reason": "Dependency", 405 | "standard_error": "\/path\/to\/some\/dir.\/logs\/slurm-58948420.out", 406 | "standard_input": "\/dev\/null", 407 | "standard_output": "\/path\/to\/some\/dir\/.\/logs\/slurm-58948420.out", 408 | "submit_time": 1729715848, 409 | "suspend_time": 0, 410 | "system_comment": "", 411 | "time_limit": { 412 | "set": true, 413 | "infinite": false, 414 | "number": 1439 415 | }, 416 | "time_minimum": { 417 | "set": true, 418 | "infinite": false, 419 | "number": 0 420 | }, 421 | "threads_per_core": { 422 | "set": false, 423 | "infinite": false, 424 | "number": 0 425 | }, 426 | "tres_bind": "", 427 | "tres_freq": "", 428 | "tres_per_job": "", 429 | "tres_per_node": "", 430 | "tres_per_socket": "", 431 | "tres_per_task": "", 432 | "tres_req_str": "cpu=1,mem=128G,node=1,billing=1", 433 | "tres_alloc_str": "", 434 | "user_id": 1234, 435 | "user_name": "bkd", 436 | "maximum_switch_wait_time": 0, 437 | "wckey": "", 438 | "current_working_directory": "\/path\/to\/some\/dir" 439 | } 440 | ] 441 | } 442 | -------------------------------------------------------------------------------- /exporter/fixtures/squeue_out.json.license: -------------------------------------------------------------------------------- 1 | 
SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/fixtures/trace_info_body.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": 35642, 3 | "cpus": 0.0, 4 | "threads": 1, 5 | "mem": 1032192, 6 | "read_bytes": 20, 7 | "write_bytes": 10, 8 | "username": "abdh", 9 | "io_wait": 0, 10 | "hostname": "somehost", 11 | "job_id": 10 12 | } 13 | -------------------------------------------------------------------------------- /exporter/fixtures/trace_info_body.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /exporter/jobs_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "strings" 9 | "testing" 10 | "time" 11 | 12 | "github.com/prometheus/client_golang/prometheus" 13 | dto "github.com/prometheus/client_model/go" 14 | "github.com/stretchr/testify/assert" 15 | ) 16 | 17 | var MockJobInfoScraper = &MockScraper{fixture: "fixtures/squeue_out.json"} 18 | 19 | func CollectCounterValue(counter prometheus.Counter) float64 { 20 | metricChan := make(chan prometheus.Metric, 1) 21 | counter.Collect(metricChan) 22 | dtoMetric := new(dto.Metric) 23 | (<-metricChan).Write(dtoMetric) 24 | return dtoMetric.GetCounter().GetValue() 25 | } 26 | 27 | func TestNewJobsController(t *testing.T) { 28 | assert := assert.New(t) 29 | config := &Config{ 30 | PollLimit: 10, 31 | TraceConf: &TraceConfig{ 32 | sharedFetcher: &JobCliFallbackFetcher{ 33 | scraper: &MockScraper{fixture: "fixtures/squeue_fallback.txt"}, 34 | cache: NewAtomicThrottledCache[JobMetric](1), 35 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 36 | }, 37 | }, 38 | cliOpts: &CliOpts{ 39 | fallback: true, 40 | }, 41 | } 42 | jc := NewJobsController(config) 43 | assert.NotNil(jc) 44 | } 45 | 46 | func TestParseJobMetrics(t *testing.T) { 47 | assert := assert.New(t) 48 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 49 | fetcher := &JobJsonFetcher{ 50 | scraper: scraper, 51 | cache: NewAtomicThrottledCache[JobMetric](100), 52 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 53 | } 54 | jms, err := fetcher.fetch() 55 | assert.NoError(err) 56 | // test parse of single job 57 | var job *JobMetric 58 | for _, m := range jms { 59 | if m.JobId == 26515966 { 60 | job = &m 61 | break 62 | } 63 | } 64 | assert.NotNil(job) 65 | assert.Equal(6.4e13, totalAllocMem(&job.JobResources)) 66 | } 67 | 68 | func TestParseCliFallback(t *testing.T) { 69 | assert := assert.New(t) 70 | cliFallbackFetcher := &JobCliFallbackFetcher{ 71 | scraper: &MockScraper{fixture: "fixtures/squeue_fallback.txt"}, 72 | cache: NewAtomicThrottledCache[JobMetric](100), 73 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 74 | } 75 | metrics, err := cliFallbackFetcher.fetch() 76 | assert.Nil(err) 77 | assert.NotEmpty(metrics) 78 | nodeAvailMetricsCount := 0 79 | for _, metric := range metrics { 80 | if strings.Contains(metric.StateReason, reqNodeNotAvailReason) { 81 | nodeAvailMetricsCount++ 82 | } 83 | } 84 | assert.Equal(1, nodeAvailMetricsCount) 85 | 
assert.Equal(2., CollectCounterValue(cliFallbackFetcher.errCounter)) 86 | } 87 | 88 | func TestUserJobMetric(t *testing.T) { 89 | // setup 90 | assert := assert.New(t) 91 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 92 | fetcher := &JobJsonFetcher{ 93 | scraper: scraper, 94 | cache: NewAtomicThrottledCache[JobMetric](100), 95 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 96 | } 97 | jms, err := fetcher.fetch() 98 | assert.Nil(err) 99 | 100 | // test 101 | state := "RUNNING" 102 | expectedUser := "bkd" 103 | 104 | for user, metric := range parseUserJobMetrics(jms) { 105 | if user == expectedUser { 106 | assert.Equal(2., metric.totalJobCount) 107 | assert.Equal(1., metric.allocCpu[state]) 108 | assert.Equal(1., metric.stateJobCount[state]) 109 | assert.Equal(6.4e+13, metric.allocMemory[state]) 110 | } else { 111 | t.Fatal("unexpected user in result") 112 | } 113 | } 114 | } 115 | 116 | func TestJobCollect(t *testing.T) { 117 | assert := assert.New(t) 118 | config := &Config{ 119 | PollLimit: 10, 120 | TraceConf: &TraceConfig{ 121 | sharedFetcher: &JobJsonFetcher{ 122 | scraper: &MockScraper{fixture: "fixtures/squeue_out.json"}, 123 | cache: NewAtomicThrottledCache[JobMetric](1), 124 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 125 | }, 126 | rate: 10, 127 | }, 128 | cliOpts: &CliOpts{}, 129 | } 130 | jc := NewJobsController(config) 131 | jobChan := make(chan prometheus.Metric) 132 | go func() { 133 | jc.Collect(jobChan) 134 | close(jobChan) 135 | }() 136 | jobMetrics := make([]prometheus.Metric, 0) 137 | for metric, ok := <-jobChan; ok; metric, ok = <-jobChan { 138 | t.Log(metric.Desc().String()) 139 | jobMetrics = append(jobMetrics, metric) 140 | } 141 | assert.NotEmpty(jobMetrics) 142 | } 143 | 144 | func TestJobCollect_Fallback(t *testing.T) { 145 | assert := assert.New(t) 146 | config := &Config{ 147 | PollLimit: 10, 148 | TraceConf: &TraceConfig{ 149 | sharedFetcher: &JobCliFallbackFetcher{ 150 | scraper: &MockScraper{fixture: "fixtures/squeue_fallback.txt"}, 151 | cache: NewAtomicThrottledCache[JobMetric](1), 152 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 153 | }, 154 | rate: 10, 155 | }, 156 | cliOpts: &CliOpts{ 157 | fallback: true, 158 | }, 159 | } 160 | jc := NewJobsController(config) 161 | jobChan := make(chan prometheus.Metric) 162 | go func() { 163 | jc.Collect(jobChan) 164 | close(jobChan) 165 | }() 166 | jobMetrics := make([]prometheus.Metric, 0) 167 | for metric, ok := <-jobChan; ok; metric, ok = <-jobChan { 168 | t.Log(metric.Desc().String()) 169 | jobMetrics = append(jobMetrics, metric) 170 | } 171 | assert.NotEmpty(jobMetrics) 172 | 173 | } 174 | 175 | func TestParsePartitionJobMetrics(t *testing.T) { 176 | assert := assert.New(t) 177 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 178 | fetcher := &JobJsonFetcher{ 179 | scraper: scraper, 180 | cache: NewAtomicThrottledCache[JobMetric](100), 181 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 182 | } 183 | jms, err := fetcher.fetch() 184 | assert.Nil(err) 185 | 186 | partitionJobMetrics := parsePartitionJobMetrics(jms) 187 | assert.Equal(float64(1), partitionJobMetrics["hw-l"].partitionState["RUNNING"]) 188 | } 189 | 190 | func TestParseFeatureMetrics(t *testing.T) { 191 | assert := assert.New(t) 192 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 193 | fetcher := &JobJsonFetcher{ 194 | scraper: scraper, 195 | cache: NewAtomicThrottledCache[JobMetric](100), 196 | errCounter:
prometheus.NewCounter(prometheus.CounterOpts{}), 197 | } 198 | jms, err := fetcher.fetch() 199 | assert.Nil(err) 200 | 201 | featureMetrics := parseFeatureMetric(jms) 202 | assert.Equal(1., featureMetrics["a100-80gb"].total) 203 | assert.Equal(1., featureMetrics["preemptible"].allocCpu) 204 | } 205 | 206 | func TestJobDescribe(t *testing.T) { 207 | assert := assert.New(t) 208 | ch := make(chan *prometheus.Desc) 209 | config, err := NewConfig(new(CliFlags)) 210 | assert.Nil(err) 211 | config.TraceConf.sharedFetcher = &JobJsonFetcher{ 212 | scraper: MockJobInfoScraper, 213 | cache: NewAtomicThrottledCache[JobMetric](1), 214 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 215 | } 216 | config.TraceConf.rate = 10 217 | jc := NewJobsController(config) 218 | go func() { 219 | jc.Describe(ch) 220 | close(ch) 221 | }() 222 | descs := make([]*prometheus.Desc, 0) 223 | for desc, ok := <-ch; ok; desc, ok = <-ch { 224 | descs = append(descs, desc) 225 | } 226 | assert.NotEmpty(descs) 227 | } 228 | 229 | func TestNAbleTimeJson(t *testing.T) { 230 | assert := assert.New(t) 231 | data := `"2023-09-21T14:31:11"` 232 | var nat NAbleTime 233 | err := nat.UnmarshalJSON([]byte(data)) 234 | assert.Nil(err) 235 | assert.True(nat.Equal(time.Date(2023, 9, 21, 14, 31, 11, 0, time.UTC))) 236 | } 237 | 238 | func TestNAbleTimeJson_NA(t *testing.T) { 239 | assert := assert.New(t) 240 | data := `"N/A"` 241 | var nat NAbleTime 242 | err := nat.UnmarshalJSON([]byte(data)) 243 | assert.Nil(err) 244 | assert.True(nat.Equal(time.Time{})) 245 | } 246 | 247 | func TestParseCliFallbackEmpty(t *testing.T) { 248 | assert := assert.New(t) 249 | scraper := &StringByteScraper{msg: ""} 250 | cliFallbackFetcher := &JobCliFallbackFetcher{ 251 | scraper: scraper, 252 | cache: NewAtomicThrottledCache[JobMetric](100), 253 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 254 | } 255 | metrics, err := cliFallbackFetcher.fetch() 256 | assert.NoError(err) 257 | assert.Empty(metrics) 258 | assert.Zero(CollectCounterValue(cliFallbackFetcher.errCounter)) 259 | assert.Equal(1, scraper.Callcount) 260 | scraper.msg = "\n" 261 | metrics, err = cliFallbackFetcher.fetch() 262 | assert.NoError(err) 263 | assert.Empty(metrics) 264 | assert.Zero(CollectCounterValue(cliFallbackFetcher.errCounter)) 265 | assert.Equal(2, scraper.Callcount) 266 | } 267 | 268 | func TestCliJobFetcherCacheHit(t *testing.T) { 269 | assert := assert.New(t) 270 | scraper := &MockScraper{fixture: "fixtures/squeue_fallback.txt"} 271 | cliFallbackFetcher := &JobCliFallbackFetcher{ 272 | scraper: scraper, 273 | cache: NewAtomicThrottledCache[JobMetric](100), 274 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 275 | } 276 | metrics, err := cliFallbackFetcher.FetchMetrics() 277 | assert.NotEmpty(metrics) 278 | assert.NoError(err) 279 | assert.Equal(1, scraper.CallCount) 280 | metrics, err = cliFallbackFetcher.FetchMetrics() 281 | assert.NotEmpty(metrics) 282 | assert.NoError(err) 283 | // assert cache hit 284 | assert.Equal(1, scraper.CallCount) 285 | } 286 | 287 | func TestCliJobFetcherCacheMiss(t *testing.T) { 288 | assert := assert.New(t) 289 | scraper := &MockScraper{fixture: "fixtures/squeue_fallback.txt"} 290 | cliFallbackFetcher := &JobCliFallbackFetcher{ 291 | scraper: scraper, 292 | cache: NewAtomicThrottledCache[JobMetric](0), 293 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 294 | } 295 | metrics, err := cliFallbackFetcher.FetchMetrics() 296 | assert.NotEmpty(metrics) 297 
| assert.NoError(err) 298 | assert.Equal(1, scraper.CallCount) 299 | metrics, err = cliFallbackFetcher.FetchMetrics() 300 | assert.NotEmpty(metrics) 301 | assert.NoError(err) 302 | // assert cache miss 303 | assert.Equal(2, scraper.CallCount) 304 | } 305 | 306 | func TestJsonJobFetcherCacheHit(t *testing.T) { 307 | assert := assert.New(t) 308 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 309 | jsonFetcher := &JobJsonFetcher{ 310 | scraper: scraper, 311 | cache: NewAtomicThrottledCache[JobMetric](100), 312 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 313 | } 314 | metrics, err := jsonFetcher.FetchMetrics() 315 | assert.NotEmpty(metrics) 316 | assert.NoError(err) 317 | assert.Equal(1, scraper.CallCount) 318 | metrics, err = jsonFetcher.FetchMetrics() 319 | assert.NotEmpty(metrics) 320 | assert.NoError(err) 321 | // assert cache hit 322 | assert.Equal(1, scraper.CallCount) 323 | } 324 | 325 | func TestJsonJobFetcherCacheMiss(t *testing.T) { 326 | assert := assert.New(t) 327 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 328 | jsonFetcher := &JobJsonFetcher{ 329 | scraper: scraper, 330 | cache: NewAtomicThrottledCache[JobMetric](0), 331 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 332 | } 333 | metrics, err := jsonFetcher.FetchMetrics() 334 | assert.NotEmpty(metrics) 335 | assert.NoError(err) 336 | assert.Equal(1, scraper.CallCount) 337 | metrics, err = jsonFetcher.FetchMetrics() 338 | assert.NotEmpty(metrics) 339 | assert.NoError(err) 340 | // assert cache miss 341 | assert.Equal(2, scraper.CallCount) 342 | } 343 | 344 | func TestParseStateReasonMetric_Fallback(t *testing.T) { 345 | assert := assert.New(t) 346 | scraper := &MockScraper{fixture: "fixtures/squeue_fallback.txt"} 347 | cliFallbackFetcher := &JobCliFallbackFetcher{ 348 | scraper: scraper, 349 | cache: NewAtomicThrottledCache[JobMetric](0), 350 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 351 | } 352 | jobMetrics, err := cliFallbackFetcher.FetchMetrics() 353 | assert.NotEmpty(jobMetrics) 354 | assert.NoError(err) 355 | m := parseStateReasonMetric(jobMetrics) 356 | assert.NotEmpty(m.pendingStateCount) 357 | assert.Equal(m.pendingStateCount["Dependency"], 1.) 358 | assert.Equal(m.pendingStateCount["Priority"], 1.) 359 | } 360 | func TestParseStateReasonMetric_Json(t *testing.T) { 361 | assert := assert.New(t) 362 | scraper := &MockScraper{fixture: "fixtures/squeue_out.json"} 363 | jsonFetcher := &JobJsonFetcher{ 364 | scraper: scraper, 365 | cache: NewAtomicThrottledCache[JobMetric](0), 366 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{Name: "errors"}), 367 | } 368 | jobMetrics, err := jsonFetcher.FetchMetrics() 369 | assert.NotEmpty(jobMetrics) 370 | assert.NoError(err) 371 | m := parseStateReasonMetric(jobMetrics) 372 | assert.NotEmpty(m.pendingStateCount) 373 | assert.Equal(m.pendingStateCount["Dependency"], 1.) 374 | } 375 | -------------------------------------------------------------------------------- /exporter/license.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc.
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package exporter 5 | 6 | import ( 7 | "encoding/json" 8 | "fmt" 9 | "log/slog" 10 | "time" 11 | 12 | "github.com/prometheus/client_golang/prometheus" 13 | ) 14 | 15 | type LicenseMetric struct { 16 | LicenseName string `json:"LicenseName"` 17 | Total int `json:"Total"` 18 | Used int `json:"Used"` 19 | Free int `json:"Free"` 20 | Remote bool `json:"Remote"` 21 | Reserved int `json:"Reserved"` 22 | LastConsumed int `json:"LastConsumed"` 23 | LastDeficit int `json:"LastDeficit"` 24 | } 25 | 26 | type scontrolLicResponse struct { 27 | Meta struct { 28 | SlurmVersion SlurmVersion `json:"meta"` 29 | } 30 | Licenses []LicenseMetric `json:"licenses"` 31 | } 32 | 33 | type CliJsonLicMetricFetcher struct { 34 | scraper SlurmByteScraper 35 | cache *AtomicThrottledCache[LicenseMetric] 36 | errorCounter prometheus.Counter 37 | } 38 | 39 | func (cjl *CliJsonLicMetricFetcher) fetch() ([]LicenseMetric, error) { 40 | licBytes, err := cjl.scraper.FetchRawBytes() 41 | if err != nil { 42 | slog.Error(fmt.Sprintf("fetch error %q", err)) 43 | cjl.errorCounter.Inc() 44 | return nil, err 45 | } 46 | lic := new(scontrolLicResponse) 47 | if err := json.Unmarshal(licBytes, lic); err != nil { 48 | slog.Error(fmt.Sprintf("Unmarshaling license metrics %q", err)) 49 | return nil, err 50 | } 51 | return lic.Licenses, nil 52 | } 53 | 54 | func (cjl *CliJsonLicMetricFetcher) FetchMetrics() ([]LicenseMetric, error) { 55 | return cjl.cache.FetchOrThrottle(cjl.fetch) 56 | } 57 | 58 | func (cjl *CliJsonLicMetricFetcher) ScrapeDuration() time.Duration { 59 | return cjl.cache.duration 60 | } 61 | 62 | func (cjl *CliJsonLicMetricFetcher) ScrapeError() prometheus.Counter { 63 | return cjl.errorCounter 64 | } 65 | 66 | type LicCollector struct { 67 | fetcher SlurmMetricFetcher[LicenseMetric] 68 | licTotal *prometheus.Desc 69 | licUsed *prometheus.Desc 70 | licFree *prometheus.Desc 71 | licReserved *prometheus.Desc 72 | licLastConsumed *prometheus.Desc 73 | licLastDeficit *prometheus.Desc 74 | licScrapeError prometheus.Counter 75 | } 76 | 77 | func NewLicCollector(config *Config) *LicCollector { 78 | cliOpts := config.cliOpts 79 | fetcher := &CliJsonLicMetricFetcher{ 80 | scraper: NewCliScraper(cliOpts.lic...), 81 | cache: NewAtomicThrottledCache[LicenseMetric](config.PollLimit), 82 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{ 83 | Name: "slurm_lic_scrape_error", 84 | Help: "slurm license scrape error", 85 | }), 86 | } 87 | return &LicCollector{ 88 | fetcher: fetcher, 89 | licTotal: prometheus.NewDesc("slurm_lic_total", "slurm license total", []string{"name"}, nil), 90 | licUsed: prometheus.NewDesc("slurm_lic_used", "slurm license used", []string{"name"}, nil), 91 | licFree: prometheus.NewDesc("slurm_lic_free", "slurm license free", []string{"name"}, nil), 92 | licLastConsumed: prometheus.NewDesc("slurm_lic_last_consumed", "slurm license last_consumed", []string{"name"}, nil), 93 | licLastDeficit: prometheus.NewDesc("slurm_lic_last_deficit", "slurm license last_deficit", []string{"name"}, nil), 94 | licReserved: prometheus.NewDesc("slurm_lic_reserved", "slurm license reserved", []string{"name"}, nil), 95 | licScrapeError: prometheus.NewCounter(prometheus.CounterOpts{ 96 | Name: "slurm_lic_scrape_error", 97 | Help: "slurm license scrape error", 98 | }), 99 | } 100 | } 101 | 102 | func (lc *LicCollector) Describe(ch chan<- *prometheus.Desc) { 103 | ch <- lc.licTotal 104 | ch <- lc.licUsed 105 | ch <- lc.licFree 106 | ch <- lc.licReserved 107 | ch <- 
lc.licLastConsumed 108 | ch <- lc.licLastDeficit 109 | ch <- lc.licScrapeError.Desc() 110 | } 111 | 112 | func (lc *LicCollector) Collect(ch chan<- prometheus.Metric) { 113 | defer func() { 114 | ch <- lc.licScrapeError 115 | }() 116 | licMetrics, err := lc.fetcher.FetchMetrics() 117 | if err != nil { 118 | lc.licScrapeError.Inc() 119 | slog.Error(fmt.Sprintf("lic parse error %q", err)) 120 | return 121 | } 122 | for _, lic := range licMetrics { 123 | if lic.Total > 0 { 124 | ch <- prometheus.MustNewConstMetric(lc.licTotal, prometheus.GaugeValue, float64(lic.Total), lic.LicenseName) 125 | } 126 | if lic.Free > 0 { 127 | ch <- prometheus.MustNewConstMetric(lc.licFree, prometheus.GaugeValue, float64(lic.Free), lic.LicenseName) 128 | } 129 | if lic.Used > 0 { 130 | ch <- prometheus.MustNewConstMetric(lc.licUsed, prometheus.GaugeValue, float64(lic.Used), lic.LicenseName) 131 | } 132 | if lic.Reserved > 0 { 133 | ch <- prometheus.MustNewConstMetric(lc.licReserved, prometheus.GaugeValue, float64(lic.Reserved), lic.LicenseName) 134 | } 135 | if lic.LastConsumed > 0 { 136 | ch <- prometheus.MustNewConstMetric(lc.licLastConsumed, prometheus.GaugeValue, float64(lic.LastConsumed), lic.LicenseName) 137 | } 138 | if lic.LastDeficit > 0 { 139 | ch <- prometheus.MustNewConstMetric(lc.licLastDeficit, prometheus.GaugeValue, float64(lic.LastDeficit), lic.LicenseName) 140 | } 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /exporter/license_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package exporter 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/prometheus/client_golang/prometheus" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | var MockLicFetcher = &MockScraper{fixture: "fixtures/license_out.json"} 14 | 15 | func TestNewLicCollector(t *testing.T) { 16 | assert := assert.New(t) 17 | config := Config{ 18 | PollLimit: 10, 19 | cliOpts: &CliOpts{ 20 | licEnabled: true, 21 | }, 22 | } 23 | lc := NewLicCollector(&config) 24 | assert.NotNil(lc) 25 | } 26 | 27 | func TestLicCollect(t *testing.T) { 28 | assert := assert.New(t) 29 | config := Config{ 30 | PollLimit: 10, 31 | cliOpts: &CliOpts{ 32 | licEnabled: true, 33 | }, 34 | } 35 | lc := NewLicCollector(&config) 36 | lc.fetcher = &CliJsonLicMetricFetcher{ 37 | scraper: MockLicFetcher, 38 | cache: NewAtomicThrottledCache[LicenseMetric](1), 39 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 40 | } 41 | lcChan := make(chan prometheus.Metric) 42 | go func() { 43 | lc.Collect(lcChan) 44 | close(lcChan) 45 | }() 46 | licMetrics := make([]prometheus.Metric, 0) 47 | for metric, ok := <-lcChan; ok; metric, ok = <-lcChan { 48 | t.Log(metric.Desc().String()) 49 | licMetrics = append(licMetrics, metric) 50 | } 51 | assert.NotEmpty(licMetrics) 52 | } 53 | 54 | func TestLicCollect_CollectionCount(t *testing.T) { 55 | assert := assert.New(t) 56 | config := Config{ 57 | PollLimit: 10, 58 | cliOpts: &CliOpts{ 59 | licEnabled: true, 60 | }, 61 | } 62 | lc := NewLicCollector(&config) 63 | lc.fetcher = &CliJsonLicMetricFetcher{ 64 | scraper: MockLicFetcher, 65 | cache: NewAtomicThrottledCache[LicenseMetric](1), 66 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 67 | } 68 | lcChan := make(chan prometheus.Metric) 69 | go func() { 70 | lc.Collect(lcChan) 71 | close(lcChan) 72 | }() 73 | licMetrics := make([]prometheus.Metric, 0) 74 | for metric,
ok := <-lcChan; ok; metric, ok = <-lcChan { 75 | t.Log(metric.Desc().String()) 76 | licMetrics = append(licMetrics, metric) 77 | } 78 | 79 | assert.Equal(7, len(licMetrics)) 80 | } 81 | 82 | func TestLicDescribe(t *testing.T) { 83 | assert := assert.New(t) 84 | config := Config{ 85 | PollLimit: 10, 86 | cliOpts: &CliOpts{ 87 | licEnabled: true, 88 | }, 89 | } 90 | lc := NewLicCollector(&config) 91 | lc.fetcher = &CliJsonLicMetricFetcher{ 92 | scraper: MockLicFetcher, 93 | cache: NewAtomicThrottledCache[LicenseMetric](1), 94 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 95 | } 96 | lcChan := make(chan *prometheus.Desc) 97 | go func() { 98 | lc.Describe(lcChan) 99 | close(lcChan) 100 | }() 101 | licMetrics := make([]*prometheus.Desc, 0) 102 | for desc, ok := <-lcChan; ok; desc, ok = <-lcChan { 103 | t.Log(desc.String()) 104 | licMetrics = append(licMetrics, desc) 105 | } 106 | assert.NotEmpty(licMetrics) 107 | } 108 | -------------------------------------------------------------------------------- /exporter/limits.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package exporter 5 | 6 | import ( 7 | "bytes" 8 | "encoding/csv" 9 | "fmt" 10 | "io" 11 | "log" 12 | "strconv" 13 | "time" 14 | 15 | "github.com/prometheus/client_golang/prometheus" 16 | "log/slog" 17 | ) 18 | 19 | type AccountLimitMetric struct { 20 | Account string 21 | // limit to the amount of resources for a particular account in the RUNNING state 22 | AllocatedMem float64 23 | AllocatedCPU float64 24 | AllocatedJobs float64 25 | // limit to the amount of resources that can be either PENDING or RUNNING 26 | TotalJobs float64 27 | } 28 | 29 | type AccountCsvFetcher struct { 30 | scraper SlurmByteScraper 31 | errorCounter prometheus.Counter 32 | cache *AtomicThrottledCache[AccountLimitMetric] 33 | } 34 | 35 | func (acf *AccountCsvFetcher) fetchFromCli() ([]AccountLimitMetric, error) { 36 | cliCsv, err := acf.scraper.FetchRawBytes() 37 | if err != nil { 38 | acf.errorCounter.Inc() 39 | slog.Error(fmt.Sprintf("failed to scrape account metrics with %q", err)) 40 | return nil, err 41 | } 42 | 43 | reader := csv.NewReader(bytes.NewBuffer(cliCsv)) 44 | reader.Comma = '|' 45 | accountMetrics := make([]AccountLimitMetric, 0) 46 | for records, err := reader.Read(); err != io.EOF; records, err = reader.Read() { 47 | if err != nil { 48 | acf.errorCounter.Inc() 49 | slog.Error(fmt.Sprintf("failed to scrape account metric row %v", records)) 50 | continue 51 | } 52 | if len(records) != 6 { 53 | acf.errorCounter.Inc() 54 | slog.Error(fmt.Sprintf("failed to scrape account metric row %v", records)) 55 | continue 56 | } 57 | user, account, cpu, mem, runningJobs, totalJobs := records[0], records[1], records[2], records[3], records[4], records[5] 58 | 59 | if user != "" { 60 | // sacctmgr will display account limits by setting the user to "" 61 | // otherwise the user -> account association is shown 62 | // i.e user Bob can allocate x cpu within account Blah 63 | continue 64 | } 65 | metric := AccountLimitMetric{Account: account} 66 | if mem != "" { 67 | if memMb, err := strconv.ParseFloat(mem, 64); err != nil { 68 | slog.Error(fmt.Sprintf("failed to scrape account metric mem string %s", mem)) 69 | acf.errorCounter.Inc() 70 | } else { 71 | metric.AllocatedMem = memMb * 1e6 72 | } 73 | } 74 | if cpu != "" { 75 | if cpuCount, err := strconv.ParseFloat(cpu, 64); err != nil { 76 | 
slog.Error(fmt.Sprintf("failed to scrape account metric cpu string %s", cpu)) 77 | acf.errorCounter.Inc() 78 | } else { 79 | metric.AllocatedCPU = cpuCount 80 | } 81 | } 82 | if runningJobs != "" { 83 | if runnableJobs, err := strconv.ParseFloat(runningJobs, 64); err != nil { 84 | slog.Error(fmt.Sprintf("failed to scrape account metric AllocatableJobs (jobs in RUNNING state) with err: %q", err)) 85 | acf.errorCounter.Inc() 86 | } else { 87 | metric.AllocatedJobs = runnableJobs 88 | } 89 | } 90 | if totalJobs != "" { 91 | if allJobs, err := strconv.ParseFloat(totalJobs, 64); err != nil { 92 | slog.Error(fmt.Sprintf("failed to scrape account metric TotalJobs (jobs in RUNNING or PENDING state) with err: %q", err)) 93 | acf.errorCounter.Inc() 94 | } else { 95 | metric.TotalJobs = allJobs 96 | } 97 | } 98 | accountMetrics = append(accountMetrics, metric) 99 | } 100 | return accountMetrics, nil 101 | } 102 | 103 | func (acf *AccountCsvFetcher) FetchMetrics() ([]AccountLimitMetric, error) { 104 | return acf.cache.FetchOrThrottle(acf.fetchFromCli) 105 | } 106 | 107 | func (acf *AccountCsvFetcher) ScrapeError() prometheus.Counter { 108 | return acf.errorCounter 109 | } 110 | 111 | func (acf *AccountCsvFetcher) ScrapeDuration() time.Duration { 112 | return acf.scraper.Duration() 113 | } 114 | 115 | type LimitCollector struct { 116 | fetcher SlurmMetricFetcher[AccountLimitMetric] 117 | accountCpuLimit *prometheus.Desc 118 | accountMemLimit *prometheus.Desc 119 | accountJobAllocCountLimit *prometheus.Desc 120 | accountJobCountLimit *prometheus.Desc 121 | limitScrapeDuration *prometheus.Desc 122 | limitScrapeError prometheus.Counter 123 | } 124 | 125 | func NewLimitCollector(config *Config) *LimitCollector { 126 | cliOpts := config.cliOpts 127 | if !cliOpts.sacctEnabled { 128 | log.Fatal("tried to invoke limit collector while cli disabled") 129 | } 130 | return &LimitCollector{ 131 | fetcher: &AccountCsvFetcher{ 132 | scraper: NewCliScraper(cliOpts.sacctmgr...), 133 | cache: NewAtomicThrottledCache[AccountLimitMetric](config.PollLimit), 134 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{ 135 | Name: "slurm_account_scrape_error", 136 | Help: "Slurm sacct scrape error", 137 | }), 138 | }, 139 | accountCpuLimit: prometheus.NewDesc("slurm_account_cpu_limit", "slurm account cpu limit", []string{"account"}, nil), 140 | accountMemLimit: prometheus.NewDesc("slurm_account_mem_limit", "slurm account mem limit (in bytes)", []string{"account"}, nil), 141 | accountJobAllocCountLimit: prometheus.NewDesc("slurm_account_job_alloc_limit", "slurm account limit on the # of jobs allowed to be RUNNING state", []string{"account"}, nil), 142 | accountJobCountLimit: prometheus.NewDesc("slurm_account_job_limit", "slurm account limit on the # of jobs allowed to be RUNNING or PENDING state", []string{"account"}, nil), 143 | limitScrapeDuration: prometheus.NewDesc("slurm_limit_scrape_duration", "slurm sacctmgr scrape duration", nil, nil), 144 | limitScrapeError: prometheus.NewCounter(prometheus.CounterOpts{ 145 | Name: "slurm_account_collect_error", 146 | Help: "Slurm sacct collect error", 147 | }), 148 | } 149 | } 150 | 151 | func (lc *LimitCollector) Describe(ch chan<- *prometheus.Desc) { 152 | ch <- lc.accountCpuLimit 153 | ch <- lc.accountMemLimit 154 | ch <- lc.limitScrapeDuration 155 | ch <- lc.limitScrapeError.Desc() 156 | } 157 | 158 | func (lc *LimitCollector) Collect(ch chan<- prometheus.Metric) { 159 | defer func() { 160 | ch <- lc.limitScrapeError 161 | }() 162 | limitMetrics, err := 
lc.fetcher.FetchMetrics() 163 | if err != nil { 164 | lc.limitScrapeError.Inc() 165 | slog.Error(fmt.Sprintf("account limit parse error %q", err)) 166 | return 167 | } 168 | 169 | emitNonZeroVal := func(desc *prometheus.Desc, val float64, account string) { 170 | if val != 0 { 171 | ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, val, account) 172 | } 173 | } 174 | ch <- prometheus.MustNewConstMetric(lc.limitScrapeDuration, prometheus.GaugeValue, float64(lc.fetcher.ScrapeDuration().Milliseconds())) 175 | for _, account := range limitMetrics { 176 | emitNonZeroVal(lc.accountMemLimit, account.AllocatedMem, account.Account) 177 | emitNonZeroVal(lc.accountCpuLimit, account.AllocatedCPU, account.Account) 178 | emitNonZeroVal(lc.accountJobAllocCountLimit, account.AllocatedJobs, account.Account) 179 | emitNonZeroVal(lc.accountJobCountLimit, account.TotalJobs, account.Account) 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /exporter/limits_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package exporter 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/prometheus/client_golang/prometheus" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | var MockSacctFetcher = &MockScraper{fixture: "fixtures/sacctmgr.txt"} 14 | 15 | func TestAccountLimitFetch(t *testing.T) { 16 | assert := assert.New(t) 17 | fetcher := AccountCsvFetcher{ 18 | scraper: MockSacctFetcher, 19 | errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 20 | cache: NewAtomicThrottledCache[AccountLimitMetric](10), 21 | } 22 | accountLimits, err := fetcher.fetchFromCli() 23 | assert.NoError(err) 24 | assert.Len(accountLimits, 6) 25 | var account5Limits AccountLimitMetric 26 | for _, metric := range accountLimits { 27 | if metric.Account == "account5" { 28 | account5Limits = metric 29 | } 30 | } 31 | assert.Equal(account5Limits.Account, "account5") 32 | assert.Equal(account5Limits.AllocatedCPU, 3974.)
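// sacctmgr --parsable2 rows are pipe-separated in the order User|Account|GrpCPU|GrpMem|GrpJobs|GrpSubmit,
// and an account-level limit row leaves the User field empty. A row consistent with this test's
// assertions would look like the following (illustrative, not copied from the fixture):
// |account5|3974|47752500|4000|30000
// GrpMem is reported in MB and fetchFromCli scales it by 1e6 to bytes, hence the 47752500.*1e6 below.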
33 | assert.Equal(account5Limits.AllocatedMem, 47752500.*1e6) 34 | assert.Equal(account5Limits.AllocatedJobs, 4.e3) 35 | assert.Equal(account5Limits.TotalJobs, 3.e4) 36 | } 37 | 38 | func TestNewLimitCollector(t *testing.T) { 39 | assert := assert.New(t) 40 | config := Config{ 41 | PollLimit: 10, 42 | cliOpts: &CliOpts{ 43 | sacctEnabled: true, 44 | }, 45 | } 46 | collector := NewLimitCollector(&config) 47 | assert.NotNil(collector) 48 | } 49 | 50 | func TestLimitCollector(t *testing.T) { 51 | assert := assert.New(t) 52 | config := Config{ 53 | PollLimit: 10, 54 | cliOpts: &CliOpts{ 55 | sacctEnabled: true, 56 | }, 57 | } 58 | lc := NewLimitCollector(&config) 59 | lc.fetcher = &AccountCsvFetcher{ 60 | scraper: MockSacctFetcher, 61 | errorCounter: lc.fetcher.ScrapeError(), 62 | cache: NewAtomicThrottledCache[AccountLimitMetric](10), 63 | } 64 | lcChan := make(chan prometheus.Metric) 65 | go func() { 66 | lc.Collect(lcChan) 67 | close(lcChan) 68 | }() 69 | limitMetrics := make([]prometheus.Metric, 0) 70 | for metric, ok := <-lcChan; ok; metric, ok = <-lcChan { 71 | t.Log(metric.Desc().String()) 72 | limitMetrics = append(limitMetrics, metric) 73 | } 74 | assert.NotEmpty(limitMetrics) 75 | } 76 | func TestLimitDescribe(t *testing.T) { 77 | assert := assert.New(t) 78 | config := Config{ 79 | PollLimit: 10, 80 | cliOpts: &CliOpts{ 81 | sacctEnabled: true, 82 | }, 83 | } 84 | lc := NewLimitCollector(&config) 85 | lc.fetcher = &AccountCsvFetcher{ 86 | scraper: MockSacctFetcher, 87 | errorCounter: lc.fetcher.ScrapeError(), 88 | cache: NewAtomicThrottledCache[AccountLimitMetric](10), 89 | } 90 | lcChan := make(chan *prometheus.Desc) 91 | go func() { 92 | lc.Describe(lcChan) 93 | close(lcChan) 94 | }() 95 | limitMetrics := make([]*prometheus.Desc, 0) 96 | for desc, ok := <-lcChan; ok; desc, ok = <-lcChan { 97 | t.Log(desc.String()) 98 | limitMetrics = append(limitMetrics, desc) 99 | } 100 | assert.Len(limitMetrics, 4) 101 | } 102 | -------------------------------------------------------------------------------- /exporter/main_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc.
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "io" 9 | "net/http" 10 | "net/http/httptest" 11 | "os" 12 | "regexp" 13 | "testing" 14 | 15 | "log/slog" 16 | 17 | "github.com/prometheus/client_golang/prometheus" 18 | "github.com/stretchr/testify/assert" 19 | ) 20 | 21 | // global test setups 22 | func TestMain(m *testing.M) { 23 | opts := slog.HandlerOptions{ 24 | Level: slog.LevelError, 25 | } 26 | textHandler := slog.NewTextHandler(io.Discard, &opts) 27 | slog.SetDefault(slog.New(textHandler)) 28 | code := m.Run() 29 | os.Exit(code) 30 | } 31 | 32 | func TestPromServer(t *testing.T) { 33 | assert := assert.New(t) 34 | cliOpts := &CliOpts{ 35 | sinfo: []string{"cat", "fixtures/sinfo_out.json"}, 36 | squeue: []string{"cat", "fixtures/squeue_out.json"}, 37 | excludeFilter: regexp.MustCompile(""), 38 | } 39 | config := &Config{ 40 | PollLimit: 10, 41 | cliOpts: cliOpts, 42 | TraceConf: &TraceConfig{ 43 | enabled: false, 44 | sharedFetcher: &JobJsonFetcher{ 45 | scraper: NewCliScraper(cliOpts.squeue...), 46 | cache: NewAtomicThrottledCache[JobMetric](1), 47 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{ 48 | Name: "slurm_job_scrape_error", 49 | Help: "job scrape error", 50 | }), 51 | }, 52 | }, 53 | } 54 | server := InitPromServer(config) 55 | w := httptest.NewRecorder() 56 | r := httptest.NewRequest(http.MethodGet, "/metrics", nil) 57 | server.ServeHTTP(w, r) 58 | assert.Equal(200, w.Code) 59 | txt := w.Body.String() 60 | assert.Contains(txt, "slurm_job_scrape_error 0") 61 | assert.Contains(txt, "slurm_node_scrape_error 0") 62 | } 63 | 64 | func TestNewConfig_Default(t *testing.T) { 65 | assert := assert.New(t) 66 | config, err := NewConfig(new(CliFlags)) 67 | assert.Nil(err) 68 | assert.Equal([]string{"sinfo", "--json"}, config.cliOpts.sinfo) 69 | assert.Equal([]string{"squeue", "--json"}, config.cliOpts.squeue) 70 | assert.Equal([]string{"scontrol", "show", "lic", "--json"}, config.cliOpts.lic) 71 | assert.Equal(uint64(10), config.TraceConf.rate) 72 | } 73 | 74 | func TestNewConfig_NonDefault(t *testing.T) { 75 | assert := assert.New(t) 76 | cliFlags := CliFlags{SlurmCliFallback: true} 77 | config, err := NewConfig(&cliFlags) 78 | assert.Nil(err) 79 | expected := []string{"squeue", "--states=all", "-h", "-r", "-o", `{"a": "%a", "id": %A, "end_time": "%e", "u": "%u", "state": "%T", "p": "%P", "cpu": %C, "mem": "%m", "array_id": "%K", "r": "%R"}`} 80 | assert.Equal(expected, config.cliOpts.squeue) 81 | } 82 | 83 | // TODO: add integration test 84 | -------------------------------------------------------------------------------- /exporter/mock_utils.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "bytes" 9 | "errors" 10 | "os" 11 | "time" 12 | ) 13 | 14 | type MockFetchErrored struct{} 15 | 16 | func (f *MockFetchErrored) FetchRawBytes() ([]byte, error) { 17 | return nil, errors.New("mock fetch error") 18 | } 19 | 20 | func (f *MockFetchErrored) Duration() time.Duration { 21 | return 1 22 | } 23 | 24 | // implements SlurmByteScraper by pulling fixtures instead 25 | // used exclusively for testing 26 | type MockScraper struct { 27 | fixture string 28 | duration time.Duration 29 | CallCount int 30 | } 31 | 32 | func (f *MockScraper) FetchRawBytes() ([]byte, error) { 33 | defer func(t time.Time) { 34 | f.duration = time.Since(t) 35 | }(time.Now()) 36 | f.CallCount++ 37 | file, err := os.ReadFile(f.fixture) 38 | if err != nil { 39 | return nil, err 40 | } 41 | // allow commenting in text files 42 | sep := []byte("\n") 43 | lines := bytes.Split(file, sep) 44 | filtered := make([][]byte, 0) 45 | for _, line := range lines { 46 | if !bytes.HasPrefix(line, []byte("#")) { 47 | filtered = append(filtered, line) 48 | } 49 | } 50 | return bytes.Join(filtered, sep), nil 51 | } 52 | 53 | func (f *MockScraper) Duration() time.Duration { 54 | return f.duration 55 | } 56 | 57 | // implements SlurmByteScraper by emitting string payload instead 58 | // used exclusively for testing 59 | type StringByteScraper struct { 60 | msg string 61 | Callcount int 62 | } 63 | 64 | func (es *StringByteScraper) FetchRawBytes() ([]byte, error) { 65 | es.Callcount++ 66 | return []byte(es.msg), nil 67 | } 68 | 69 | func (es *StringByteScraper) Duration() time.Duration { 70 | return time.Duration(1) 71 | } 72 | -------------------------------------------------------------------------------- /exporter/nodes_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "fmt" 9 | "testing" 10 | 11 | "github.com/prometheus/client_golang/prometheus" 12 | "github.com/stretchr/testify/assert" 13 | "slices" 14 | ) 15 | 16 | var MockNodeInfoScraper = &MockScraper{fixture: "fixtures/sinfo_out.json"} 17 | var MockNodeInfoDataParserScraper = &MockScraper{fixture: "fixtures/sinfo_dataparser.json"} 18 | 19 | func TestNewNodeCollector(t *testing.T) { 20 | assert := assert.New(t) 21 | config := &Config{ 22 | cliOpts: &CliOpts{ 23 | fallback: true, 24 | }, 25 | } 26 | collector := NewNodeCollecter(config) 27 | assert.IsType(collector.fetcher, &NodeCliFallbackFetcher{}) 28 | config.cliOpts.fallback = false 29 | collector = NewNodeCollecter(config) 30 | assert.IsType(collector.fetcher, &NodeJsonFetcher{}) 31 | 32 | } 33 | 34 | func TestParseNodeMetrics(t *testing.T) { 35 | fetcher := NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 36 | nodeMetrics, err := fetcher.FetchMetrics() 37 | if err != nil { 38 | t.Fatalf("Failed to parse metrics with %s", err) 39 | } 40 | if len(nodeMetrics) == 0 { 41 | t.Fatal("No metrics received") 42 | } 43 | t.Logf("Node metrics collected %d", len(nodeMetrics)) 44 | } 45 | 46 | func sumStateMetric(metric map[string]float64) float64 { 47 | sum := 0.
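// collapse a per-state map into one total, e.g. {"allocated": 2., "idle": 2.} -> 4.
// (illustrative values; the four-node sinfo fixture used in the tests below sums to 4)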
48 | for _, val := range metric { 49 | sum += val 50 | } 51 | return sum 52 | } 53 | 54 | func TestPartitionMetric(t *testing.T) { 55 | assert := assert.New(t) 56 | fetcher := NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 57 | nodeMetrics, err := fetcher.FetchMetrics() 58 | assert.Nil(err) 59 | metrics := fetchNodePartitionMetrics(nodeMetrics) 60 | assert.Equal(1, len(metrics)) 61 | _, contains := metrics["hw"] 62 | assert.True(contains) 63 | assert.Equal(4., sumStateMetric(metrics["hw"].StateAllocCpus)) 64 | assert.Equal(256., metrics["hw"].TotalCpus) 65 | assert.Equal(114688., sumStateMetric(metrics["hw"].StateAllocMemory)) 66 | assert.Equal(1.823573e+06, metrics["hw"].FreeMemory) 67 | assert.Equal(2e+06, metrics["hw"].RealMemory) 68 | assert.Equal(252., metrics["hw"].IdleCpus) 69 | assert.Equal(4., sumStateMetric(metrics["hw"].StateNodeCount)) 70 | } 71 | 72 | func TestNodeSummaryCpuMetric(t *testing.T) { 73 | assert := assert.New(t) 74 | fetcher := NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 75 | nodeMetrics, err := fetcher.FetchMetrics() 76 | assert.Nil(err) 77 | metrics := fetchNodeTotalCpuMetrics(nodeMetrics) 78 | assert.Equal(4, len(metrics.PerState)) 79 | for _, psm := range metrics.PerState { 80 | assert.Equal(64., psm.Cpus) 81 | assert.Equal(1., psm.Count) 82 | } 83 | } 84 | 85 | func TestNodeSummaryMemoryMetrics(t *testing.T) { 86 | assert := assert.New(t) 87 | fetcher := NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 88 | nodeMetrics, err := fetcher.FetchMetrics() 89 | assert.Nil(err) 90 | metrics := fetchNodeTotalMemMetrics(nodeMetrics) 91 | assert.Equal(114688., metrics.AllocMemory) 92 | assert.Equal(1.823573e+06, metrics.FreeMemory) 93 | assert.Equal(2e+06, metrics.RealMemory) 94 | } 95 | 96 | func TestNodeCollector(t *testing.T) { 97 | assert := assert.New(t) 98 | config, err := NewConfig(new(CliFlags)) 99 | assert.Nil(err) 100 | nc := NewNodeCollecter(config) 101 | // cache miss, use our mock fetcher 102 | nc.fetcher = &NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 103 | metricChan := make(chan prometheus.Metric) 104 | go func() { 105 | nc.Collect(metricChan) 106 | close(metricChan) 107 | }() 108 | metrics := make([]prometheus.Metric, 0) 109 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 110 | metrics = append(metrics, m) 111 | t.Logf("Received metric %s", m.Desc().String()) 112 | } 113 | assert.NotEmpty(metrics) 114 | } 115 | 116 | func TestNodeDescribe(t *testing.T) { 117 | assert := assert.New(t) 118 | ch := make(chan *prometheus.Desc) 119 | config, err := NewConfig(new(CliFlags)) 120 | assert.Nil(err) 121 | jc := NewNodeCollecter(config) 122 | jc.fetcher = &NodeJsonFetcher{scraper: MockNodeInfoScraper, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 123 | go func() { 124 | jc.Describe(ch) 125 | close(ch) 126 | }() 127 | descs := make([]*prometheus.Desc, 0) 128 | for desc, ok := <-ch; ok; desc, ok = <-ch { 129 | descs = append(descs, desc) 130 | } 131 | assert.NotEmpty(descs) 132 | } 133 | 134 | func TestParseFallbackNodeMetrics(t *testing.T) { 135 | assert := 
assert.New(t) 136 | byteFetcher := &MockScraper{fixture: "fixtures/sinfo_fallback.txt"} 137 | fetcher := NodeCliFallbackFetcher{scraper: byteFetcher, errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}), cache: NewAtomicThrottledCache[NodeMetric](1)} 138 | metrics, err := fetcher.FetchMetrics() 139 | assert.Nil(err) 140 | assert.NotEmpty(metrics) 141 | cs25idx := slices.IndexFunc(metrics, func(nm NodeMetric) bool { return nm.Hostname == "cs25" }) 142 | assert.GreaterOrEqual(cs25idx, 0) 143 | cs25NodeMetric := metrics[cs25idx] 144 | assert.Equal("allocated", cs25NodeMetric.State) 145 | assert.Equal(64., cs25NodeMetric.AllocCpus) 146 | assert.Equal(89124.*1e6, cs25NodeMetric.FreeMemory) 147 | assert.Equal([]string{"hw", "hw-l", "hw-m", "hw-h", "cdn"}, cs25NodeMetric.Partitions) 148 | } 149 | 150 | func TestNAbleFloat_NA(t *testing.T) { 151 | assert := assert.New(t) 152 | n := NAbleFloat(1.5) 153 | data := []byte(`"N/A"`) 154 | assert.NoError(n.UnmarshalJSON(data)) 155 | assert.Equal(0., float64(n)) 156 | } 157 | 158 | func TestNAbleFloat_Float(t *testing.T) { 159 | assert := assert.New(t) 160 | n := NAbleFloat(1.5) 161 | expected := 3.14 162 | data := []byte(fmt.Sprintf(`"%f"`, expected)) 163 | assert.NoError(n.UnmarshalJSON(data)) 164 | assert.Equal(expected, float64(n)) 165 | } 166 | -------------------------------------------------------------------------------- /exporter/server.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "net/http" 9 | "os" 10 | "regexp" 11 | "strconv" 12 | "strings" 13 | 14 | "log/slog" 15 | 16 | "github.com/prometheus/client_golang/prometheus" 17 | "github.com/prometheus/client_golang/prometheus/promhttp" 18 | dto "github.com/prometheus/client_model/go" 19 | ) 20 | 21 | type CliOpts struct { 22 | sinfo []string 23 | squeue []string 24 | sacctmgr []string 25 | lic []string 26 | sdiag []string 27 | licEnabled bool 28 | diagsEnabled bool 29 | fallback bool 30 | sacctEnabled bool 31 | excludeFilter *regexp.Regexp 32 | } 33 | 34 | type TraceConfig struct { 35 | enabled bool 36 | path string 37 | rate uint64 38 | sharedFetcher SlurmMetricFetcher[JobMetric] 39 | } 40 | 41 | type Config struct { 42 | TraceConf *TraceConfig 43 | PollLimit float64 44 | LogLevel slog.Level 45 | ListenAddress string 46 | MetricsPath string 47 | cliOpts *CliOpts 48 | } 49 | 50 | type CliFlags struct { 51 | SlurmLicEnabled bool 52 | SlurmDiagEnabled bool 53 | SlurmCliFallback bool 54 | TraceEnabled bool 55 | SacctEnabled bool 56 | SlurmPollLimit float64 57 | LogLevel string 58 | ListenAddress string 59 | MetricsPath string 60 | SlurmSqueueOverride string 61 | SlurmSinfoOverride string 62 | SlurmDiagOverride string 63 | SlurmAcctOverride string 64 | TraceRate uint64 65 | TracePath string 66 | SlurmLicenseOverride string 67 | MetricsExcludeFilterRegex string 68 | } 69 | 70 | var logLevelMap = map[string]slog.Level{ 71 | "debug": slog.LevelDebug, 72 | "info": slog.LevelInfo, 73 | "warn": slog.LevelWarn, 74 | "error": slog.LevelError, 75 | } 76 | 77 | func NewConfig(cliFlags *CliFlags) (*Config, error) { 78 | // defaults 79 | compiledExcludeRegex, err := regexp.Compile(cliFlags.MetricsExcludeFilterRegex) 80 | if err != nil { 81 | return nil, err 82 | } 83 | cliOpts := CliOpts{ 84 | squeue: []string{"squeue", "--json"}, 85 | sinfo: []string{"sinfo", "--json"}, 86 | lic: []string{"scontrol", "show", "lic", 
"--json"}, 87 | sdiag: []string{"sdiag", "--json"}, 88 | sacctmgr: []string{"sacctmgr", "show", "assoc", "format=User,Account,GrpCPU,GrpMem,GrpJobs,GrpSubmit", "--noheader", "--parsable2"}, 89 | licEnabled: cliFlags.SlurmLicEnabled, 90 | diagsEnabled: cliFlags.SlurmDiagEnabled, 91 | fallback: cliFlags.SlurmCliFallback, 92 | sacctEnabled: cliFlags.SacctEnabled, 93 | excludeFilter: compiledExcludeRegex, 94 | } 95 | traceConf := TraceConfig{ 96 | enabled: cliFlags.TraceEnabled, 97 | path: "/trace", 98 | rate: 10, 99 | } 100 | config := &Config{ 101 | PollLimit: 10, 102 | LogLevel: slog.LevelInfo, 103 | ListenAddress: ":9092", 104 | MetricsPath: "/metrics", 105 | TraceConf: &traceConf, 106 | cliOpts: &cliOpts, 107 | } 108 | if lm, ok := os.LookupEnv("POLL_LIMIT"); ok { 109 | if limit, err := strconv.ParseFloat(lm, 64); err != nil { 110 | return nil, err 111 | } else { 112 | config.PollLimit = limit 113 | } 114 | } 115 | if cliFlags.SlurmPollLimit > 0 { 116 | config.PollLimit = cliFlags.SlurmPollLimit 117 | } 118 | if lvl, ok := os.LookupEnv("LOGLEVEL"); ok { 119 | config.LogLevel = logLevelMap[lvl] 120 | } 121 | if cliFlags.LogLevel != "" { 122 | config.LogLevel = logLevelMap[cliFlags.LogLevel] 123 | } 124 | if cliFlags.ListenAddress != "" { 125 | config.ListenAddress = cliFlags.ListenAddress 126 | } 127 | if cliFlags.MetricsPath != "" { 128 | config.MetricsPath = cliFlags.MetricsPath 129 | } 130 | if cliFlags.SlurmSqueueOverride != "" { 131 | cliOpts.squeue = strings.Split(cliFlags.SlurmSqueueOverride, " ") 132 | } 133 | if cliFlags.SlurmSinfoOverride != "" { 134 | cliOpts.sinfo = strings.Split(cliFlags.SlurmSinfoOverride, " ") 135 | } 136 | if cliFlags.SlurmDiagOverride != "" { 137 | cliOpts.sdiag = strings.Split(cliFlags.SlurmDiagOverride, " ") 138 | } 139 | if cliFlags.SlurmAcctOverride != "" { 140 | cliOpts.sacctmgr = strings.Split(cliFlags.SlurmAcctOverride, " ") 141 | } 142 | if cliFlags.TraceRate != 0 { 143 | traceConf.rate = cliFlags.TraceRate 144 | } 145 | if cliFlags.TracePath != "" { 146 | traceConf.path = cliFlags.TracePath 147 | } 148 | if cliFlags.SlurmLicenseOverride != "" { 149 | cliOpts.lic = strings.Split(cliFlags.SlurmLicenseOverride, " ") 150 | } 151 | if cliOpts.fallback { 152 | // we define a custom json format that we convert back into the openapi format 153 | if cliFlags.SlurmSqueueOverride == "" { 154 | cliOpts.squeue = []string{"squeue", "--states=all", "-h", "-r", "-o", `{"a": "%a", "id": %A, "end_time": "%e", "u": "%u", "state": "%T", "p": "%P", "cpu": %C, "mem": "%m", "array_id": "%K", "r": "%R"}`} 155 | } 156 | if cliFlags.SlurmSinfoOverride == "" { 157 | cliOpts.sinfo = []string{"sinfo", "-h", "-o", `{"s": "%T", "mem": %m, "n": "%n", "l": "%O", "p": "%R", "fmem": "%e", "cstate": "%C", "w": %w}`} 158 | } 159 | // must instantiate the job fetcher here since it is shared between 2 collectors 160 | traceConf.sharedFetcher = &JobCliFallbackFetcher{ 161 | scraper: NewCliScraper(cliOpts.squeue...), 162 | cache: NewAtomicThrottledCache[JobMetric](config.PollLimit), 163 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{ 164 | Name: "job_scrape_errors", 165 | Help: "job scrape errors", 166 | }), 167 | } 168 | } else { 169 | traceConf.sharedFetcher = &JobJsonFetcher{ 170 | scraper: NewCliScraper(cliOpts.squeue...), 171 | cache: NewAtomicThrottledCache[JobMetric](config.PollLimit), 172 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{ 173 | Name: "job_scrape_errors", 174 | Help: "job scrape errors", 175 | }), 176 | } 177 | } 178 | return config, nil 179 | 
} 180 | 181 | func NewPromHTTPServer(metricsExcludeFilter *regexp.Regexp) http.Handler { 182 | // Create a handler that filters metrics based on the exclude regex pattern 183 | if metricsExcludeFilter == nil || metricsExcludeFilter.String() == "" { 184 | return promhttp.Handler() 185 | } 186 | slog.Info("filtering metrics based on regex: " + metricsExcludeFilter.String()) 187 | filteredGatherer := prometheus.GathererFunc(func() ([]*dto.MetricFamily, error) { 188 | allMetrics, err := prometheus.DefaultGatherer.Gather() 189 | if err != nil { 190 | return nil, err 191 | } 192 | var filteredMetrics []*dto.MetricFamily 193 | for _, mf := range allMetrics { 194 | if !metricsExcludeFilter.MatchString(mf.GetName()) { 195 | filteredMetrics = append(filteredMetrics, mf) 196 | } 197 | } 198 | return filteredMetrics, nil 199 | }) 200 | return promhttp.HandlerFor(filteredGatherer, promhttp.HandlerOpts{}) 201 | } 202 | 203 | func InitPromServer(config *Config) http.Handler { 204 | textHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ 205 | Level: config.LogLevel, 206 | }) 207 | slog.SetDefault(slog.New(textHandler)) 208 | prometheus.MustRegister(NewNodeCollecter(config), NewJobsController(config)) 209 | if traceconf := config.TraceConf; traceconf.enabled { 210 | slog.Info("trace path enabled at path: " + config.ListenAddress + traceconf.path) 211 | traceController := NewTraceCollector(config) 212 | http.HandleFunc(traceconf.path, traceController.uploadTrace) 213 | prometheus.MustRegister(traceController) 214 | } 215 | cliOpts := config.cliOpts 216 | if cliOpts.licEnabled { 217 | slog.Info("license collection enabled") 218 | prometheus.MustRegister(NewLicCollector(config)) 219 | } 220 | if cliOpts.diagsEnabled { 221 | slog.Info("daemon diagnostic collection enabled") 222 | prometheus.MustRegister(NewDiagsCollector(config)) 223 | } 224 | if cliOpts.sacctEnabled { 225 | slog.Info("account limit collection enabled") 226 | prometheus.MustRegister(NewLimitCollector(config)) 227 | } 228 | 229 | return NewPromHTTPServer(cliOpts.excludeFilter) 230 | } 231 | -------------------------------------------------------------------------------- /exporter/trace.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "encoding/json" 9 | "errors" 10 | "fmt" 11 | "net/http" 12 | "os" 13 | "path/filepath" 14 | "sync" 15 | "text/template" 16 | "time" 17 | 18 | "github.com/prometheus/client_golang/prometheus" 19 | "log/slog" 20 | ) 21 | 22 | const ( 23 | // cleanup on add if greater than this threshold 24 | cleanupThreshold uint64 = 1_000 25 | templateDirName string = "templates" 26 | proctraceTemplate string = ` 27 | <html> 28 | <head> 29 | <style> 30 | table, th, td { border: 1px solid black; } 31 | </style> 32 | </head> 33 | <body> 34 | <table> 35 | <tr> 36 | <th>Job Id</th> <th>Process Id</th> <th>Cpu %</th> <th>Write Bytes</th> <th>Read Bytes</th> <th>Memory Usage</th> <th>Username</th> <th>Hostname</th> 37 | </tr> 38 | 39 | {{ range . }} 40 | <tr> 41 | <td>{{ .JobId }}</td> 42 | <td>{{ .Pid }}</td> 43 | <td>{{ .Cpus }}</td> 44 | <td>{{ .WriteBytes }}</td> 45 | <td>{{ .ReadBytes }}</td> 46 | <td>{{ .Mem }}</td> 47 | <td>{{ .Username }}</td> 48 | <td>{{ .Hostname }}</td> 49 | </tr> 50 | {{ end }} 51 | </table> </body> </html>
52 | 53 | 54 | ` 55 | ) 56 | 57 | // store a job's published proc stats 58 | type TraceInfo struct { 59 | JobId int64 `json:"job_id"` 60 | Pid int64 `json:"pid"` 61 | Cpus float64 `json:"cpus"` 62 | WriteBytes float64 `json:"write_bytes"` 63 | ReadBytes float64 `json:"read_bytes"` 64 | Threads float64 `json:"threads"` 65 | Mem float64 `json:"mem"` 66 | Username string `json:"username"` 67 | Hostname string `json:"hostname"` 68 | // do not set explicitly, overridden on Add 69 | uploadAt time.Time 70 | } 71 | 72 | type AtomicProcFetcher struct { 73 | sync.Mutex 74 | Info map[int64]*TraceInfo 75 | sampleRate uint64 76 | cleanupThreshold uint64 77 | } 78 | 79 | func NewAtomicProFetcher(sampleRate uint64) *AtomicProcFetcher { 80 | return &AtomicProcFetcher{ 81 | Info: make(map[int64]*TraceInfo), 82 | sampleRate: sampleRate, 83 | cleanupThreshold: cleanupThreshold, 84 | } 85 | } 86 | 87 | // clean stale entries 88 | func (m *AtomicProcFetcher) cleanup() { 89 | for jobid, metric := range m.Info { 90 | if time.Since(metric.uploadAt).Seconds() > float64(m.sampleRate) { 91 | delete(m.Info, jobid) 92 | } 93 | } 94 | } 95 | 96 | func (m *AtomicProcFetcher) Add(trace *TraceInfo) error { 97 | m.Lock() 98 | defer m.Unlock() 99 | if trace.JobId == 0 { 100 | return errors.New("job id unset") 101 | } 102 | trace.uploadAt = time.Now() 103 | m.Info[trace.JobId] = trace 104 | if len(m.Info) > int(m.cleanupThreshold) { 105 | m.cleanup() 106 | } 107 | return nil 108 | } 109 | 110 | func (m *AtomicProcFetcher) Fetch() map[int64]*TraceInfo { 111 | m.Lock() 112 | defer m.Unlock() 113 | m.cleanup() 114 | cpy := make(map[int64]*TraceInfo) 115 | for k, v := range m.Info { 116 | cpy[k] = v 117 | } 118 | return cpy 119 | } 120 | 121 | type TraceCollector struct { 122 | ProcessFetcher *AtomicProcFetcher 123 | squeueFetcher SlurmMetricFetcher[JobMetric] 124 | fallback bool 125 | // actual proc monitoring 126 | jobAllocMem *prometheus.Desc 127 | jobAllocCpus *prometheus.Desc 128 | pid *prometheus.Desc 129 | cpuUsage *prometheus.Desc 130 | memUsage *prometheus.Desc 131 | threadCount *prometheus.Desc 132 | writeBytes *prometheus.Desc 133 | readBytes *prometheus.Desc 134 | } 135 | 136 | func NewTraceCollector(config *Config) *TraceCollector { 137 | traceConfig := config.TraceConf 138 | return &TraceCollector{ 139 | ProcessFetcher: NewAtomicProFetcher(traceConfig.rate), 140 | squeueFetcher: traceConfig.sharedFetcher, 141 | fallback: config.cliOpts.fallback, 142 | // add for job id correlation 143 | jobAllocMem: prometheus.NewDesc("slurm_job_mem_alloc", "running job mem allocated", []string{"jobid"}, nil), 144 | jobAllocCpus: prometheus.NewDesc("slurm_job_cpu_alloc", "running job cpus allocated", []string{"jobid"}, nil), 145 | pid: prometheus.NewDesc("slurm_proc_pid", "pid of running slurm job", []string{"jobid", "hostname"}, nil), 146 | cpuUsage: prometheus.NewDesc("slurm_proc_cpu_usage", "actual cpu usage collected from proc monitor", []string{"jobid", "username"}, nil), 147 | memUsage: prometheus.NewDesc("slurm_proc_mem_usage", "proc mem usage", []string{"jobid", "username"}, nil), 148 | threadCount: prometheus.NewDesc("slurm_proc_threadcount", "threads currently being used", []string{"jobid", "username"}, nil), 149 | writeBytes: prometheus.NewDesc("slurm_proc_write_bytes", "proc write bytes", []string{"jobid", "username"}, nil), 150 | readBytes: prometheus.NewDesc("slurm_proc_read_bytes", "proc read bytes", []string{"jobid", "username"}, nil), 151 | } 152 | } 153 | 154 | func (c *TraceCollector) Describe(ch chan<-
*prometheus.Desc) { 155 | ch <- c.jobAllocMem 156 | ch <- c.jobAllocCpus 157 | ch <- c.pid 158 | ch <- c.cpuUsage 159 | ch <- c.memUsage 160 | ch <- c.threadCount 161 | ch <- c.writeBytes 162 | ch <- c.readBytes 163 | } 164 | 165 | func (c *TraceCollector) Collect(ch chan<- prometheus.Metric) { 166 | procs := c.ProcessFetcher.Fetch() 167 | jobMetrics, err := c.squeueFetcher.FetchMetrics() 168 | if err != nil { 169 | return 170 | } 171 | for _, j := range jobMetrics { 172 | p, ok := procs[int64(j.JobId)] 173 | if !ok { 174 | continue 175 | } 176 | jobid := fmt.Sprint(p.JobId) 177 | ch <- prometheus.MustNewConstMetric(c.jobAllocMem, prometheus.GaugeValue, totalAllocMem(&j.JobResources), jobid) 178 | ch <- prometheus.MustNewConstMetric(c.jobAllocCpus, prometheus.GaugeValue, j.JobResources.AllocCpus, jobid) 179 | ch <- prometheus.MustNewConstMetric(c.pid, prometheus.GaugeValue, float64(p.Pid), jobid, p.Hostname) 180 | ch <- prometheus.MustNewConstMetric(c.cpuUsage, prometheus.GaugeValue, p.Cpus, jobid, p.Username) 181 | ch <- prometheus.MustNewConstMetric(c.memUsage, prometheus.GaugeValue, p.Mem, jobid, p.Username) 182 | ch <- prometheus.MustNewConstMetric(c.threadCount, prometheus.GaugeValue, p.Threads, jobid, p.Username) 183 | ch <- prometheus.MustNewConstMetric(c.writeBytes, prometheus.GaugeValue, p.WriteBytes, jobid, p.Username) 184 | ch <- prometheus.MustNewConstMetric(c.readBytes, prometheus.GaugeValue, p.ReadBytes, jobid, p.Username) 185 | } 186 | } 187 | 188 | func (c *TraceCollector) uploadTrace(w http.ResponseWriter, r *http.Request) { 189 | if r.Method == http.MethodPost { 190 | defer r.Body.Close() 191 | var info TraceInfo 192 | if err := json.NewDecoder(r.Body).Decode(&info); err != nil { 193 | slog.Error(fmt.Sprintf("unable to decode trace response due to err: %q", err)) 194 | return 195 | } 196 | if err := c.ProcessFetcher.Add(&info); err != nil { 197 | slog.Error(fmt.Sprintf("failed to add to map with: %q", err)) 198 | return 199 | } 200 | } 201 | if r.Method == http.MethodGet { 202 | 203 | tmpl := template.Must(template.New("proc_traces").Parse(proctraceTemplate)) 204 | procs := c.ProcessFetcher.Fetch() 205 | traces := make([]TraceInfo, 0, len(procs)) 206 | for _, info := range procs { 207 | traces = append(traces, *info) 208 | } 209 | if err := tmpl.Execute(w, traces); err != nil { 210 | slog.Error(fmt.Sprintf("template failed to render with err: %q", err)) 211 | return 212 | } 213 | } 214 | } 215 | 216 | // detectTraceTemplatePath returns the trace_root path based on the following criteria: 217 | // 1. If TRACE_ROOT_PATH is specified, search that directory. If we don't find a templates dir, let's panic and crash the program. 218 | // 2. If TRACE_ROOT_PATH isn't specified, we can search cwd and /usr/share/prometheus-slurm-exporter. 
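//    e.g. (hypothetical install prefix): TRACE_ROOT_PATH=/opt/slurm-exporter resolves
//    to /opt/slurm-exporter/templates, which must exist or the function panics.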
219 | // If no templates path is found, returns an empty string 220 | func detectTraceTemplatePath() string { 221 | if rpath, ok := os.LookupEnv("TRACE_ROOT_PATH"); ok { 222 | templateP := filepath.Join(rpath, templateDirName) 223 | if _, err := os.Stat(templateP); err != nil { 224 | panic("TRACE_ROOT_PATH must include a directory called: templates") 225 | } 226 | return templateP 227 | } 228 | for _, rpath := range []string{".", "/usr/share/prometheus-slurm-exporter"} { 229 | templateP := filepath.Join(rpath, templateDirName) 230 | if _, err := os.Stat(templateP); err == nil { 231 | return templateP 232 | } 233 | } 234 | return "" 235 | } 236 | -------------------------------------------------------------------------------- /exporter/trace_test.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "net/http" 11 | "net/http/httptest" 12 | "os" 13 | "path/filepath" 14 | "testing" 15 | "time" 16 | 17 | "github.com/prometheus/client_golang/prometheus" 18 | "github.com/stretchr/testify/assert" 19 | "github.com/stretchr/testify/require" 20 | ) 21 | 22 | func TestAtomicFetcher_Cleanup(t *testing.T) { 23 | assert := assert.New(t) 24 | sampleRate := 10 25 | fetcher := NewAtomicProFetcher(uint64(sampleRate)) 26 | fetcher.Info[11] = &TraceInfo{JobId: 11, uploadAt: time.Now().Add(-time.Second * 11)} 27 | fetcher.Info[10] = &TraceInfo{JobId: 10, uploadAt: time.Now()} 28 | fetcher.cleanup() 29 | assert.Contains(fetcher.Info, int64(10)) 30 | } 31 | 32 | func TestAtomicFetcher_Add(t *testing.T) { 33 | assert := assert.New(t) 34 | fetcher := NewAtomicProFetcher(10) 35 | info := TraceInfo{JobId: 10} 36 | err := fetcher.Add(&info) 37 | assert.Nil(err) 38 | assert.Equal(1, len(fetcher.Info)) 39 | assert.Contains(fetcher.Info, int64(10)) 40 | } 41 | 42 | func TestAtomicFetcher_AddOverflow(t *testing.T) { 43 | assert := assert.New(t) 44 | sampleRate := 10 45 | fetcher := NewAtomicProFetcher(uint64(sampleRate)) 46 | fetcher.cleanupThreshold = 1 47 | fetcher.Info[11] = &TraceInfo{JobId: 11, uploadAt: time.Now().Add(-time.Second * 11)} 48 | fetcher.Add(&TraceInfo{JobId: 10}) 49 | assert.Equal(1, len(fetcher.Info)) 50 | // assert.Contains(10, fetcher.Info) 51 | } 52 | 53 | func TestAtomicFetcher_AddNoJobid(t *testing.T) { 54 | assert := assert.New(t) 55 | fetcher := AtomicProcFetcher{Info: make(map[int64]*TraceInfo)} 56 | info := TraceInfo{JobId: 0} 57 | err := fetcher.Add(&info) 58 | assert.NotNil(err) 59 | } 60 | 61 | func TestAtomicFetcher_FetchStale(t *testing.T) { 62 | assert := assert.New(t) 63 | fetcher := NewAtomicProFetcher(1) 64 | fetcher.Info[10] = &TraceInfo{uploadAt: time.Now().Add(-time.Second * 10)} 65 | traces := fetcher.Fetch() 66 | assert.Equal(0, len(traces)) 67 | } 68 | 69 | func TestAtomicFetcher_Fetch(t *testing.T) { 70 | assert := assert.New(t) 71 | fetcher := NewAtomicProFetcher(10) 72 | fetcher.Info[10] = &TraceInfo{uploadAt: time.Now()} 73 | traces := fetcher.Fetch() 74 | assert.Equal(1, len(traces)) 75 | } 76 | 77 | func TestUploadTracePost(t *testing.T) { 78 | assert := assert.New(t) 79 | fixture, err := os.ReadFile("fixtures/trace_info_body.json") 80 | assert.Nil(err) 81 | config, err := NewConfig(new(CliFlags)) 82 | assert.Nil(err) 83 | r := httptest.NewRequest(http.MethodPost, "dummy.url:8092/trace", bytes.NewBuffer(fixture)) 84 | w := httptest.NewRecorder() 85 | c := 
NewTraceCollector(config) 86 | c.uploadTrace(w, r) 87 | assert.Equal(1, len(c.ProcessFetcher.Info)) 88 | } 89 | 90 | func TestUploadTraceGet(t *testing.T) { 91 | assert := assert.New(t) 92 | r := httptest.NewRequest(http.MethodGet, "dummy.url:8092/trace", nil) 93 | w := httptest.NewRecorder() 94 | config, err := NewConfig(new(CliFlags)) 95 | assert.Nil(err) 96 | c := NewTraceCollector(config) 97 | c.ProcessFetcher.Info[10] = &TraceInfo{} 98 | c.uploadTrace(w, r) 99 | assert.Equal(200, w.Code) 100 | assert.Positive(w.Body.Len()) 101 | } 102 | 103 | func TestTraceControllerCollect(t *testing.T) { 104 | assert := assert.New(t) 105 | config := &Config{ 106 | PollLimit: 10, 107 | TraceConf: &TraceConfig{ 108 | rate: 10, 109 | sharedFetcher: &JobJsonFetcher{ 110 | scraper: MockJobInfoScraper, 111 | cache: NewAtomicThrottledCache[JobMetric](1), 112 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 113 | }, 114 | }, 115 | cliOpts: new(CliOpts), 116 | } 117 | c := NewTraceCollector(config) 118 | c.ProcessFetcher.Add(&TraceInfo{JobId: 26515966}) 119 | assert.NotEmpty(c.ProcessFetcher.Info) 120 | metricChan := make(chan prometheus.Metric) 121 | go func() { 122 | c.Collect(metricChan) 123 | close(metricChan) 124 | }() 125 | 126 | metrics := make([]prometheus.Metric, 0) 127 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 128 | metrics = append(metrics, m) 129 | } 130 | assert.NotEmpty(metrics) 131 | } 132 | 133 | func TestTraceControllerCollect_Fallback(t *testing.T) { 134 | assert := assert.New(t) 135 | config := &Config{ 136 | PollLimit: 10, 137 | TraceConf: &TraceConfig{ 138 | rate: 10, 139 | sharedFetcher: &JobCliFallbackFetcher{ 140 | scraper: &MockScraper{fixture: "fixtures/squeue_fallback.txt"}, 141 | cache: NewAtomicThrottledCache[JobMetric](1), 142 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 143 | }, 144 | }, 145 | cliOpts: &CliOpts{fallback: true}, 146 | } 147 | c := NewTraceCollector(config) 148 | c.ProcessFetcher.Add(&TraceInfo{JobId: 26515966}) 149 | assert.NotEmpty(c.ProcessFetcher.Info) 150 | metricChan := make(chan prometheus.Metric) 151 | go func() { 152 | c.Collect(metricChan) 153 | close(metricChan) 154 | }() 155 | 156 | metrics := make([]prometheus.Metric, 0) 157 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 158 | metrics = append(metrics, m) 159 | } 160 | assert.NotEmpty(metrics) 161 | } 162 | 163 | func TestTraceControllerDescribe(t *testing.T) { 164 | assert := assert.New(t) 165 | config := &Config{ 166 | PollLimit: 10, 167 | TraceConf: &TraceConfig{ 168 | rate: 10, 169 | sharedFetcher: &JobJsonFetcher{ 170 | scraper: MockJobInfoScraper, 171 | cache: NewAtomicThrottledCache[JobMetric](1), 172 | errCounter: prometheus.NewCounter(prometheus.CounterOpts{}), 173 | }, 174 | }, 175 | cliOpts: new(CliOpts), 176 | } 177 | c := NewTraceCollector(config) 178 | c.ProcessFetcher.Add(&TraceInfo{JobId: 26515966}) 179 | assert.Positive(len(c.ProcessFetcher.Info)) 180 | metricChan := make(chan *prometheus.Desc) 181 | go func() { 182 | assert.Positive(len(c.ProcessFetcher.Info)) 183 | c.Describe(metricChan) 184 | close(metricChan) 185 | }() 186 | 187 | metrics := make([]*prometheus.Desc, 0) 188 | for m, ok := <-metricChan; ok; m, ok = <-metricChan { 189 | metrics = append(metrics, m) 190 | t.Logf("Received metric %s", m.String()) 191 | } 192 | assert.Positive(len(metrics)) 193 | } 194 | 195 | func TestPython3Wrapper(t *testing.T) { 196 | if testing.Short() { 197 | t.Skip() 198 | } 199 | assert := assert.New(t) 200 | fetcher := 
NewCliScraper("python3", "../wrappers/proctrac.py", "--cmd", "sleep", "100", "--jobid=10", "--validate") 201 | t.Logf("cmd: %+v", fetcher.args) 202 | wrapperOut, err := fetcher.FetchRawBytes() 203 | assert.Nil(err) 204 | var info TraceInfo 205 | json.Unmarshal(wrapperOut, &info) 206 | assert.Equal(int64(10), info.JobId) 207 | } 208 | 209 | func TestDetectTraceRootPath_Env(t *testing.T) { 210 | testDir := t.TempDir() 211 | t.Setenv("TRACE_ROOT_PATH", testDir) 212 | // Ensure that the function panics if given a TRACE_ROOT_PATh with no 'templates' subdirectory 213 | assert.PanicsWithValue(t, "TRACE_ROOT_PATH must include a directory called: templates", func() { detectTraceTemplatePath() }) 214 | require.NoError(t, os.Mkdir(filepath.Join(testDir, templateDirName), 0o700)) 215 | 216 | // Now that we have a 'templates' subdir, it should no longer panic 217 | assert.Equal(t, filepath.Join(testDir, templateDirName), detectTraceTemplatePath()) 218 | } 219 | 220 | func TestDetectTraceRootPath_Default(t *testing.T) { 221 | testDir := t.TempDir() 222 | os.Chdir(testDir) 223 | 224 | // Should come back empty if since we don't yet have a 'templates' subdir 225 | assert.Equal(t, detectTraceTemplatePath(), "") 226 | require.NoError(t, os.Mkdir(filepath.Join(testDir, templateDirName), 0o700)) 227 | 228 | // Now that we have 'templates' subdir, cwd is a valid path 229 | assert.Equal(t, templateDirName, detectTraceTemplatePath()) 230 | } 231 | -------------------------------------------------------------------------------- /exporter/utils.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "errors" 11 | "fmt" 12 | "os" 13 | "os/exec" 14 | "regexp" 15 | "strconv" 16 | "strings" 17 | "sync" 18 | "time" 19 | 20 | "log/slog" 21 | 22 | "github.com/prometheus/client_golang/prometheus" 23 | ) 24 | 25 | type SlurmPrimitiveMetric interface { 26 | NodeMetric | JobMetric | DiagMetric | LicenseMetric | AccountLimitMetric 27 | } 28 | 29 | type CoercedInt int 30 | 31 | func (ci *CoercedInt) UnmarshalJSON(data []byte) error { 32 | var nativeInt int 33 | if err := json.Unmarshal(data, &nativeInt); err == nil { 34 | *ci = CoercedInt(nativeInt) 35 | return nil 36 | } 37 | var stringInt string 38 | if err := json.Unmarshal(data, &stringInt); err != nil { 39 | return err 40 | } 41 | convertedInt, err := strconv.ParseInt(stringInt, 10, 64) 42 | if err != nil { 43 | return err 44 | } 45 | *ci = CoercedInt(convertedInt) 46 | return nil 47 | } 48 | 49 | type SlurmVersion struct { 50 | Version struct { 51 | Major CoercedInt `json:"major"` 52 | Micro CoercedInt `json:"micro"` 53 | Minor CoercedInt `json:"minor"` 54 | } `json:"version"` 55 | Release string `json:"release"` 56 | } 57 | 58 | // interface for getting data from slurm 59 | // used for dep injection/ease of testing & for add slurmrestd support later 60 | type SlurmByteScraper interface { 61 | FetchRawBytes() ([]byte, error) 62 | Duration() time.Duration 63 | } 64 | 65 | type SlurmMetricFetcher[M SlurmPrimitiveMetric] interface { 66 | FetchMetrics() ([]M, error) 67 | ScrapeDuration() time.Duration 68 | ScrapeError() prometheus.Counter 69 | } 70 | 71 | type AtomicThrottledCache[C SlurmPrimitiveMetric] struct { 72 | sync.Mutex 73 | t time.Time 74 | limit float64 75 | cache []C 76 | // duration of last cache miss 77 | duration time.Duration 78 | } 79 | 80 | // atomic fetch 
of either the cache or the collector 81 | // reset & hydrate as necessary 82 | func (atc *AtomicThrottledCache[C]) FetchOrThrottle(fetchFunc func() ([]C, error)) ([]C, error) { 83 | atc.Lock() 84 | defer atc.Unlock() 85 | if len(atc.cache) > 0 && time.Since(atc.t).Seconds() < atc.limit { 86 | return atc.cache, nil 87 | } 88 | t := time.Now() 89 | slurmData, err := fetchFunc() 90 | if err != nil { 91 | return nil, err 92 | } 93 | atc.duration = time.Since(t) 94 | atc.cache = slurmData 95 | atc.t = time.Now() 96 | return slurmData, nil 97 | } 98 | 99 | func NewAtomicThrottledCache[C SlurmPrimitiveMetric](limit float64) *AtomicThrottledCache[C] { 100 | return &AtomicThrottledCache[C]{ 101 | t: time.Now(), 102 | limit: limit, 103 | } 104 | } 105 | 106 | func track(cmd []string) (string, time.Time) { 107 | return strings.Join(cmd, " "), time.Now() 108 | } 109 | 110 | func duration(msg string, start time.Time) { 111 | slog.Debug(fmt.Sprintf("cmd %s took %s", msg, time.Since(start))) 112 | } 113 | 114 | // implements SlurmByteScraper by fetching data from the cli 115 | type CliScraper struct { 116 | args []string 117 | timeout time.Duration 118 | duration time.Duration 119 | } 120 | 121 | func (cf *CliScraper) Duration() time.Duration { 122 | return cf.duration 123 | } 124 | 125 | func (cf *CliScraper) FetchRawBytes() ([]byte, error) { 126 | defer func(t time.Time) { cf.duration = time.Since(t) }(time.Now()) 127 | if len(cf.args) == 0 { 128 | return nil, errors.New("need at least 1 arg") 129 | } 130 | defer duration(track(cf.args)) 131 | cmd := exec.Command(cf.args[0], cf.args[1:]...) 132 | var outb, errb bytes.Buffer 133 | cmd.Stdout = &outb 134 | cmd.Stderr = &errb 135 | if err := cmd.Start(); err != nil { 136 | return nil, err 137 | } 138 | timer := time.AfterFunc(cf.timeout, func() { 139 | if err := cmd.Process.Kill(); err != nil { 140 | slog.Error(fmt.Sprintf("failed to cancel cmd: %v", cf.args)) 141 | } 142 | }) 143 | defer timer.Stop() 144 | if err := cmd.Wait(); err != nil { 145 | return nil, err 146 | } 147 | if errb.Len() > 0 { 148 | return nil, fmt.Errorf("cmd failed with %s", errb.String()) 149 | } 150 | return outb.Bytes(), nil 151 | } 152 | 153 | func NewCliScraper(args ...string) *CliScraper { 154 | var limit float64 = 10 155 | var err error 156 | if tm, ok := os.LookupEnv("CLI_TIMEOUT"); ok { 157 | if limit, err = strconv.ParseFloat(tm, 64); err != nil { 158 | slog.Error("`CLI_TIMEOUT` env var parse error") 159 | } 160 | } 161 | return &CliScraper{ 162 | args: args, 163 | timeout: time.Duration(limit) * time.Second, 164 | } 165 | } 166 | 167 | // convert slurm mem string to float64 bytes 168 | func MemToFloat(mem string) (float64, error) { 169 | if num, err := strconv.ParseFloat(mem, 64); err == nil { 170 | return num, nil 171 | } 172 | memUnits := map[string]float64{ 173 | "M": 1e+6, 174 | "G": 1e+9, 175 | "T": 1e+12, 176 | } 177 | re := regexp.MustCompile(`^(?P<num>([0-9]*[.])?[0-9]+)(?P<memunit>G|M|T)$`) 178 | matches := re.FindStringSubmatch(mem) 179 | if len(matches) < 2 { 180 | return -1, fmt.Errorf("mem string %s doesn't match regex %s nor is it a float", mem, re) 181 | } 182 | // err here should be impossible due to regex 183 | num, err := strconv.ParseFloat(matches[re.SubexpIndex("num")], 64) 184 | memunit := memUnits[matches[re.SubexpIndex("memunit")]] 185 | return num * memunit, err 186 | } 187 | -------------------------------------------------------------------------------- /exporter/utils_test.go: -------------------------------------------------------------------------------- 1 | // 
SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | package exporter 6 | 7 | import ( 8 | "fmt" 9 | "math" 10 | "math/rand" 11 | "testing" 12 | "time" 13 | 14 | "github.com/stretchr/testify/assert" 15 | "log/slog" 16 | ) 17 | 18 | const chars string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 19 | 20 | var seededRand *rand.Rand 21 | 22 | func init() { 23 | seed := time.Now().UnixNano() 24 | seededRand = rand.New(rand.NewSource(seed)) 25 | slog.Debug(fmt.Sprintf("rand seed: %d", seed)) 26 | } 27 | 28 | func generateRandString(n int) string { 29 | randBytes := make([]byte, n) 30 | for i := 0; i < n; i++ { 31 | randBytes[i] = chars[seededRand.Int()%len(chars)] 32 | } 33 | return string(randBytes) 34 | } 35 | 36 | func TestCliFetcher(t *testing.T) { 37 | assert := assert.New(t) 38 | cliFetcher := NewCliScraper("ls") 39 | data, err := cliFetcher.FetchRawBytes() 40 | assert.NoError(err) 41 | assert.NotNil(data) 42 | } 43 | 44 | func TestCliFetcher_Timeout(t *testing.T) { 45 | assert := assert.New(t) 46 | cliFetcher := NewCliScraper("sleep", "100") 47 | cliFetcher.timeout = 0 48 | data, err := cliFetcher.FetchRawBytes() 49 | assert.EqualError(err, "signal: killed") 50 | assert.Nil(data) 51 | } 52 | 53 | func TestCliFetcher_EmptyArgs(t *testing.T) { 54 | assert := assert.New(t) 55 | cliFetcher := NewCliScraper() 56 | data, err := cliFetcher.FetchRawBytes() 57 | assert.EqualError(err, "need at least 1 arg") 58 | assert.Nil(data) 59 | } 60 | 61 | func TestCliFetcher_ExitCodeCmd(t *testing.T) { 62 | assert := assert.New(t) 63 | cliFetcher := NewCliScraper("ls", generateRandString(64)) 64 | data, err := cliFetcher.FetchRawBytes() 65 | assert.NotNil(err) 66 | assert.Nil(data) 67 | } 68 | 69 | func TestCliFetcher_StdErr(t *testing.T) { 70 | assert := assert.New(t) 71 | // the rare case where stderr is written but exit code is still 0 72 | cmd := `echo -e "error" 1>&2` 73 | cliFetcher := NewCliScraper("/bin/bash", "-c", cmd) 74 | data, err := cliFetcher.FetchRawBytes() 75 | assert.NotNil(err) 76 | assert.Nil(data) 77 | } 78 | 79 | func TestAtomicThrottledCache_CompMiss(t *testing.T) { 80 | assert := assert.New(t) 81 | cache := NewAtomicThrottledCache[NodeMetric](10) 82 | // empty cache scenario 83 | called := false 84 | host := "host1" 85 | info, err := cache.FetchOrThrottle(func() ([]NodeMetric, error) { 86 | called = true 87 | return []NodeMetric{{Hostname: host}}, nil 88 | }) 89 | assert.Nil(err) 90 | assert.Equal(info[0].Hostname, host) 91 | // assert no cache hit 92 | assert.True(called) 93 | // assert cache populated 94 | assert.Equal(host, cache.cache[0].Hostname) 95 | } 96 | 97 | func TestAtomicThrottledCache_Hit(t *testing.T) { 98 | assert := assert.New(t) 99 | cache := NewAtomicThrottledCache[NodeMetric](math.MaxFloat64) 100 | cache.cache = []NodeMetric{{Hostname: "host1"}} 101 | // warm cache scenario 102 | called := false 103 | info, err := cache.FetchOrThrottle(func() ([]NodeMetric, error) { 104 | called = true 105 | return []NodeMetric{{Hostname: "host2"}}, nil 106 | }) 107 | assert.Nil(err) 108 | assert.Equal(info[0].Hostname, "host1") 109 | // assert fetch not called 110 | assert.False(called) 111 | // assert cache populated 112 | assert.NotEmpty(cache.cache) 113 | } 114 | 115 | func TestAtomicThrottledCache_Stale(t *testing.T) { 116 | assert := assert.New(t) 117 | cache := NewAtomicThrottledCache[NodeMetric](0) 118 | cache.cache = []NodeMetric{{Hostname: "host1"}} 119 | called := false 120 | info, err := 
cache.FetchOrThrottle(func() ([]NodeMetric, error) { 121 | called = true 122 | return []NodeMetric{{Hostname: "host2"}}, nil 123 | }) 124 | assert.Nil(err) 125 | assert.Equal(info[0].Hostname, "host2") 126 | // assert fetch was called 127 | assert.True(called) 128 | // assert cache repopulated 129 | assert.Equal(cache.cache[0].Hostname, "host2") 130 | } 131 | 132 | func TestConvertMemToFloat(t *testing.T) { 133 | assert := assert.New(t) 134 | e := 1.2e+7 135 | for _, s := range []string{"12M", "12G", "12T"} { 136 | n, err := MemToFloat(s) 137 | assert.Nil(err) 138 | assert.Equal(e, n) 139 | e *= 1e+3 140 | } 141 | n, err := MemToFloat("0") 142 | assert.Nil(err) 143 | assert.Zero(n) 144 | } 145 | 146 | func TestConvertMemToFloat_Sad(t *testing.T) { 147 | assert := assert.New(t) 148 | n, err := MemToFloat("afal") 149 | assert.Error(err) 150 | assert.Equal(-1., n) 151 | } 152 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/rivosinc/prometheus-slurm-exporter 2 | 3 | go 1.23.1 4 | 5 | require ( 6 | github.com/prometheus/client_golang v1.20.5 7 | github.com/prometheus/client_model v0.6.1 8 | github.com/stretchr/testify v1.10.0 9 | ) 10 | 11 | require ( 12 | github.com/beorn7/perks v1.0.1 // indirect 13 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 14 | github.com/davecgh/go-spew v1.1.1 // indirect 15 | github.com/klauspost/compress v1.17.9 // indirect 16 | github.com/kr/text v0.2.0 // indirect 17 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 18 | github.com/pmezard/go-difflib v1.0.0 // indirect 19 | github.com/prometheus/common v0.55.0 // indirect 20 | github.com/prometheus/procfs v0.15.1 // indirect 21 | github.com/rogpeppe/go-internal v1.11.0 // indirect 22 | golang.org/x/sys v0.22.0 // indirect 23 | google.golang.org/protobuf v1.34.2 // indirect 24 | gopkg.in/yaml.v3 v3.0.1 // indirect 25 | ) 26 | -------------------------------------------------------------------------------- /go.mod.license: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 2 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 3 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 4 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 5 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 6 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 7 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 9 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 10 | github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= 11 | github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= 12 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 13 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 14 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 15 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 16 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 17 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 18 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 19 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 20 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 21 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 22 | github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= 23 | github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= 24 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 25 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 26 | github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= 27 | github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= 28 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 29 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 30 | github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= 31 | github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= 32 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 33 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 34 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= 35 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 36 | google.golang.org/protobuf v1.34.2 
h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 37 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 38 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 39 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 40 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 41 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 42 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 43 | -------------------------------------------------------------------------------- /go.sum.license: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /images/dev_container_launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rivosinc/prometheus-slurm-exporter/9186d0777b03fce4d36c988336639335e827eb1f/images/dev_container_launch.png -------------------------------------------------------------------------------- /images/dev_container_launch.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /images/trace_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rivosinc/prometheus-slurm-exporter/9186d0777b03fce4d36c988336639335e827eb1f/images/trace_example.png -------------------------------------------------------------------------------- /images/trace_example.png.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | 3 | SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /init_cgroup.conf: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | CgroupAutomount=yes 5 | ConstrainCores=yes 6 | ConstrainDevices=yes 7 | ConstrainRAMSpace=yes 8 | CgroupPlugin=cgroup/v1 9 | -------------------------------------------------------------------------------- /init_slurm.conf: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | ClusterName=default-cluster 5 | SlurmctldHost=localhost 6 | AuthType=auth/munge 7 | ProctrackType=proctrack/linuxproc 8 | ReturnToService=1 9 | SlurmctldPidFile=/var/run/slurmctld.pid 10 | SlurmctldPort=6817 11 | SlurmdPidFile=/var/run/slurmd.pid 12 | SlurmdPort=6818 13 | SlurmdSpoolDir=/var/spool/slurmd 14 | SlurmUser=root 15 | StateSaveLocation=/var/spool/slurmctld 16 | # TaskPlugin=task/affinity 17 | InactiveLimit=0 18 | KillWait=30 19 | MinJobAge=300 20 | SlurmctldTimeout=120 21 | SlurmdTimeout=300 22 | Waittime=0 23 | SchedulerType=sched/backfill 24 | SelectType=select/cons_tres 25 | JobCompType=jobcomp/none 26 | JobAcctGatherFrequency=30 27 | SlurmctldDebug=info 28 | SlurmctldLogFile=/var/log/slurmctld.log 29 | SlurmdDebug=info 30 | SlurmdLogFile=/var/log/slurmd.log 31 | NodeName=localhost 32 | PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP 33 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | build_dir := "./build" 6 | coverage := "coverage" 7 | vpython := "venv/bin/python3" 8 | # default ld_library and include paths that work within container 9 | ld_library := "/usr/lib64/lib/slurm" 10 | include_path := "/usr/lib64/include" 11 | 12 | set dotenv-load 13 | set shell := ["bash", "-ceuo", "pipefail"] 14 | 15 | # list all recipes 16 | default: 17 | just --list 18 | 19 | init: 20 | go mod tidy 21 | rm -rf venv 22 | python3 -m venv venv 23 | {{vpython}} -m pip install -U pip pre-commit psutil requests 24 | ./venv/bin/pre-commit install --install-hooks 25 | if ! [ -f .env ]; then printf "SLURM_LIB_DIR={{ld_library}}\nSLURM_INCLUDE_DIR={{include_path}}\n" > .env; fi 26 | 27 | build: 28 | rm -rf {{build_dir}} 29 | mkdir {{build_dir}} 30 | CGO_ENABLED=0 go build -o {{build_dir}}/slurm_exporter . 31 | 32 | devel: build 33 | {{build_dir}}/slurm_exporter \ 34 | -trace.enabled \ 35 | -slurm.cli-fallback \ 36 | -slurm.collect-limits \ 37 | -slurm.collect-diags \ 38 | -slurm.collect-licenses \ 39 | -slurm.squeue-cli "cat exporter/fixtures/squeue_fallback.txt" \ 40 | -slurm.sinfo-cli "cat exporter/fixtures/sinfo_fallback.txt" \ 41 | -slurm.diag-cli "cat exporter/fixtures/sdiag.json" \ 42 | -slurm.lic-cli "cat exporter/fixtures/license_out.json" \ 43 | -slurm.sacctmgr-cli "cat exporter/fixtures/sacctmgr.txt" 44 | 45 | prod: build 46 | {{build_dir}}/slurm_exporter -slurm.cli-fallback -web.listen-address :9093 47 | 48 | test-exporter: 49 | source venv/bin/activate && CGO_ENABLED=0 go test ./exporter 50 | 51 | cover: 52 | CGO_ENABLED=0 go test -coverprofile=c.out 53 | go tool cover -html="c.out" 54 | 55 | fmt: 56 | go fmt 57 | 58 | docker: 59 | docker build -t slurmcprom . 
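# e.g. run the image just built (a sketch; ports and flags depend on your local setup):
#   docker run --rm -it -p 9092:9092 slurmcprom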
60 | 61 | test-all: 62 | #!/bin/bash 63 | set -aeuxo pipefail 64 | CGO_CXXFLAGS="-I${SLURM_INCLUDE_DIR}" 65 | CGO_LDFLAGS="-L${SLURM_LIB_DIR} -lslurmfull" 66 | LD_LIBRARY_PATH="${SLURM_LIB_DIR}" 67 | go test ./exporter ./slurmcprom 68 | 69 | crun: 70 | #!/bin/bash 71 | set -aeuxo pipefail 72 | rm -rf {{build_dir}} 73 | mkdir -p {{build_dir}} 74 | CGO_CXXFLAGS="-I${SLURM_INCLUDE_DIR}" 75 | CGO_LDFLAGS="-L${SLURM_LIB_DIR} -lslurmfull" 76 | POLL_LIMIT=1 77 | LD_LIBRARY_PATH="${SLURM_LIB_DIR}" 78 | go build -o {{build_dir}}/cexporter cmain.go 79 | {{build_dir}}/cexporter 80 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2023 Rivos Inc. 2 | // 3 | // SPDX-License-Identifier: Apache-2.0 4 | package main 5 | 6 | import ( 7 | "flag" 8 | "log" 9 | "net/http" 10 | 11 | "log/slog" 12 | 13 | "github.com/rivosinc/prometheus-slurm-exporter/exporter" 14 | ) 15 | 16 | var ( 17 | listenAddress = flag.String("web.listen-address", "", 18 | `Address to listen on for telemetry (default: :9092)`) 19 | metricsPath = flag.String("web.telemetry-path", "", 20 | "Path under which to expose metrics (default: /metrics)") 21 | logLevel = flag.String("web.log-level", "", "Log level: info, debug, error, warning") 22 | traceEnabled = flag.Bool("trace.enabled", false, "Set up POST endpoint for collecting traces") 23 | tracePath = flag.String("trace.path", "", "POST path to upload job proc info") 24 | traceRate = flag.Uint64("trace.rate", 0, "number of seconds proc info should stay in memory before being marked as stale (default 10)") 25 | slurmPollLimit = flag.Float64("slurm.poll-limit", 0, "throttle for slurmctld (default: 10s)") 26 | slurmSinfoOverride = flag.String("slurm.sinfo-cli", "", "sinfo cli override") 27 | slurmSqueueOverride = flag.String("slurm.squeue-cli", "", "squeue cli override") 28 | slurmLicenseOverride = flag.String("slurm.lic-cli", "", "license cli override") 29 | slurmDiagOverride = flag.String("slurm.diag-cli", "", "sdiag cli override") 30 | slurmSacctOverride = flag.String("slurm.sacctmgr-cli", "", "sacctmgr cli override") 31 | slurmLicEnabled = flag.Bool("slurm.collect-licenses", false, "Collect license info from slurm") 32 | slurmDiagEnabled = flag.Bool("slurm.collect-diags", false, "Collect daemon diagnostics stats from slurm") 33 | slurmSacctEnabled = flag.Bool("slurm.collect-limits", false, "Collect account and user limits from slurm") 34 | slurmCliFallback = flag.Bool("slurm.cli-fallback", true, "drop the --json arg and revert back to standard squeue for performance reasons") 35 | metricsFilterRegex = flag.String("metrics.exclude", "", "Regex pattern for metrics to exclude") 36 | ) 37 | 38 | func main() { 39 | flag.Parse() 40 | cliFlags := exporter.CliFlags{ 41 | ListenAddress: *listenAddress, 42 | MetricsPath: *metricsPath, 43 | LogLevel: *logLevel, 44 | TraceEnabled: *traceEnabled, 45 | TracePath: *tracePath, 46 | SlurmPollLimit: *slurmPollLimit, 47 | SlurmSinfoOverride: *slurmSinfoOverride, 48 | SlurmSqueueOverride: *slurmSqueueOverride, 49 | SlurmLicenseOverride: *slurmLicenseOverride, 50 | SlurmDiagOverride: *slurmDiagOverride, 51 | SlurmLicEnabled: *slurmLicEnabled, 52 | SlurmDiagEnabled: *slurmDiagEnabled, 53 | SacctEnabled: *slurmSacctEnabled, 54 | SlurmCliFallback: *slurmCliFallback, 55 | TraceRate: *traceRate, 56 | SlurmAcctOverride: *slurmSacctOverride, 57 | MetricsExcludeFilterRegex: *metricsFilterRegex, 58 | } 59 | config, 
err := exporter.NewConfig(&cliFlags) 60 | if err != nil { 61 | log.Fatalf("failed to init config with %q", err) 62 | } 63 | handler := exporter.InitPromServer(config) 64 | http.Handle(config.MetricsPath, handler) 65 | slog.Info("serving metrics at " + config.ListenAddress + config.MetricsPath) 66 | log.Fatalf("server exited with %q", http.ListenAndServe(config.ListenAddress, nil)) 67 | 68 | } 69 | -------------------------------------------------------------------------------- /wrappers/proctrac.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # SPDX-FileCopyrightText: 2023 Rivos Inc. 4 | # 5 | # SPDX-License-Identifier: Apache-2.0 6 | 7 | import psutil 8 | import os 9 | import requests 10 | from time import sleep 11 | from typing import Generator 12 | import argparse as ag 13 | import json 14 | import platform 15 | from datetime import datetime 16 | from dataclasses import dataclass, asdict, field 17 | from pprint import pprint 18 | from getpass import getuser 19 | 20 | 21 | # must correlate with trace info struct 22 | @dataclass 23 | class TraceInfo: 24 | pid: int 25 | cpus: float 26 | threads: float 27 | mem: float 28 | read_bytes: float 29 | write_bytes: float 30 | job_id: int 31 | username: str = field(default_factory=getuser) 32 | hostname: str = field(default_factory=platform.node) 33 | 34 | @classmethod 35 | def from_proc(cls, jobid: int, proc: psutil.Popen) -> "TraceInfo": 36 | io_counters = proc.io_counters() 37 | return cls( 38 | pid=proc.pid, 39 | cpus=proc.cpu_percent(0.1), 40 | threads=proc.num_threads(), 41 | mem=proc.memory_info().rss, 42 | read_bytes=io_counters.read_bytes, 43 | write_bytes=io_counters.write_bytes, 44 | job_id=jobid, 45 | ) 46 | 47 | def add_subproc(self, proc: psutil.Popen): 48 | io_counters = proc.io_counters() 49 | self.cpus += proc.cpu_percent(0.1) 50 | self.threads += proc.num_threads() 51 | self.mem += proc.memory_info().rss 52 | self.read_bytes += io_counters.read_bytes 53 | self.write_bytes += io_counters.write_bytes 54 | 55 | 56 | class ProcWrapper: 57 | """thin wrapper to send slurm proc metrics to our exporter""" 58 | 59 | sample_rate: int 60 | jobid: int 61 | proc: psutil.Popen 62 | 63 | def __init__(self, cmd=[], sample_rate=0, jobid=0): 64 | self.cmd = cmd 65 | self.sample_rate = sample_rate 66 | self.jobid = jobid 67 | assert self.jobid > 0, "SLURM_JOBID must be provided" 68 | assert self.cmd, "no cmd provided" 69 | assert self.sample_rate > 0, "sample rate must be greater than 0" 70 | self.proc = psutil.Popen(self.cmd) 71 | 72 | def poll_info(self) -> Generator[TraceInfo, None, None]: 73 | while self.proc.poll() is None: 74 | trace = TraceInfo.from_proc(self.jobid, self.proc) 75 | start = datetime.now() 76 | for p in self.proc.children(True): 77 | try: 78 | trace.add_subproc(p) 79 | except psutil.Error as e: 80 | print(f"failed to poll child process with error {e}") 81 | continue 82 | yield trace 83 | dur = datetime.now() - start 84 | sleep(max(self.sample_rate - dur.seconds, 0)) 85 | 86 | 87 | if __name__ == "__main__": 88 | parser = ag.ArgumentParser( 89 | "cmd wrapper", 90 | """ 91 | Simple wrapper around any process using psutil. It can be used inline, e.g. 92 | $ python proctrac.py sleep 10 93 | The wrapper will then resolve the sample rate from the SAMPLE_RATE env var and 94 | the endpoint url for the slurm exporter from the SLURM_EXPORTER env var. 95 | Or pass explicit cmdline args, e.g. 
96 | $ python proctrac.py --endpoint localhost:8092 --sample-rate 10 --cmd sleep 10 97 | """, 98 | "This script is intended to be called from within a sbatch script wrapper", 99 | ) 100 | parser.add_argument("argv", nargs="*") 101 | parser.add_argument( 102 | "--endpoint", 103 | help="endpoint for slurm exporter", 104 | default=os.getenv("SLURM_EXPORTER", "localhost:8092"), 105 | ) 106 | parser.add_argument( 107 | "--sample-rate", 108 | type=float, 109 | help="rate to sample wrapped proc", 110 | default=float(os.getenv("SAMPLE_RATE", 10)), 111 | ) 112 | parser.add_argument("--cmd", nargs="+") 113 | parser.add_argument( 114 | "--jobid", 115 | type=int, 116 | help="explicitly pass the slurm job id (very rarely needed)", 117 | default=int(os.getenv("SLURM_JOBID", 0)), 118 | ) 119 | parser.add_argument("--dry-run", action="store_true") 120 | parser.add_argument("--verbose", action="store_true") 121 | parser.add_argument( 122 | "--validate", 123 | action="store_true", 124 | help="run the poll once to check for schema correctness", 125 | ) 126 | args = parser.parse_args() 127 | assert not (args.argv and args.cmd), "argv and --cmd are mutually exclusive" 128 | assert args.argv or args.cmd, "must provide a command to wrap" 129 | wrapper = ProcWrapper(args.cmd or args.argv, args.sample_rate, args.jobid) 130 | 131 | if args.validate: 132 | print(json.dumps(asdict(next(wrapper.poll_info())))) 133 | wrapper.proc.terminate() 134 | elif args.dry_run: 135 | [pprint(asdict(stat)) for stat in wrapper.poll_info()] 136 | else: 137 | for trace in wrapper.poll_info(): 138 | resp = requests.post(args.endpoint, json=asdict(trace)) 139 | if args.verbose: 140 | print(asdict(trace), resp) 141 | --------------------------------------------------------------------------------
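For reference, a minimal Go sketch of the upload that wrappers/proctrac.py performs on each sample: marshal one TraceInfo payload and POST it to the exporter's trace endpoint. The URL below is an assumption (the default SLURM_EXPORTER endpoint localhost:8092 plus a /trace path); adjust it to your -web.listen-address and -trace.path settings. Field names mirror the json tags on exporter.TraceInfo; all sample values are hypothetical.

package main

import (
	"bytes"
	"encoding/json"
	"log"
	"net/http"
)

// traceInfo mirrors the json tags of exporter.TraceInfo
type traceInfo struct {
	JobId      int64   `json:"job_id"`
	Pid        int64   `json:"pid"`
	Cpus       float64 `json:"cpus"`
	WriteBytes float64 `json:"write_bytes"`
	ReadBytes  float64 `json:"read_bytes"`
	Threads    float64 `json:"threads"`
	Mem        float64 `json:"mem"`
	Username   string  `json:"username"`
	Hostname   string  `json:"hostname"`
}

func main() {
	// hypothetical sample values for a single poll
	payload := traceInfo{JobId: 42, Pid: 4242, Cpus: 1.5, Threads: 4, Mem: 2e9, Username: "someuser", Hostname: "node01"}
	body, err := json.Marshal(payload)
	if err != nil {
		log.Fatal(err)
	}
	// hypothetical address: default SLURM_EXPORTER endpoint plus the trace upload path
	resp, err := http.Post("http://localhost:8092/trace", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	log.Printf("trace upload status: %s", resp.Status)
}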