├── internal ├── api │ ├── transport_test.go │ ├── endpoints.go │ ├── endpoints_2311.go │ ├── endpoints_2405.go │ ├── endpoints_2411.go │ ├── handler.go │ ├── cache.go │ ├── unmarshalers_2311_test.go │ ├── unmarshalers_2405_test.go │ ├── unmarshalers_2411_test.go │ ├── responses_2311.go │ ├── responses_2405.go │ ├── responses_2411.go │ ├── unmarshalers.go │ └── transport.go ├── util │ ├── time.go │ ├── fmt.go │ ├── infinity.go │ └── testdata.go ├── types │ ├── keys.go │ ├── nodes.go │ └── jobs.go └── slurm │ ├── fairshare.go │ ├── users.go │ ├── gpus.go │ ├── cpus.go │ ├── account.go │ ├── node.go │ ├── nodes.go │ ├── queue.go │ ├── partitions.go │ └── scheduler.go ├── .gitignore ├── images ├── Job_Status.png ├── Node_Status.png └── Scheduler_Info.png ├── docker ├── commands ├── lets_go_job.sbatch ├── hello_world_job.sbatch ├── slurmdbd.conf ├── cgroup.conf ├── start_jobs.sh ├── slurm.conf ├── slurm.dockerfile ├── start_slurm.sh ├── 23.11.dockerfile ├── build_slurm_version.py └── 24.05.dockerfile ├── openapitools.json ├── extras └── systemd │ └── prometheus-slurm-exporter.service ├── go.mod ├── Makefile ├── .github └── workflows │ └── gotest.yml ├── .goreleaser.yaml ├── go.sum ├── CONTRIBUTING.md ├── README.md ├── cmd └── prometheus-slurm-exporter │ └── main.go └── testdata ├── SlurmV0041GetShares200Response.json ├── V0041OpenapiSharesResp.json ├── SlurmV0041GetDiag200Response.json ├── V0041OpenapiNodesResp.json ├── V0041OpenapiPartitionResp.json ├── V0040OpenapiSharesResp.json ├── V0040OpenapiDiagResp.json ├── V0040OpenapiPartitionResp.json └── V0040OpenapiJobInfoResp.json /internal/api/transport_test.go: -------------------------------------------------------------------------------- 1 | package api 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | go/ 3 | *.snap 4 | 5 | dist/ 6 | .ansible 7 | -------------------------------------------------------------------------------- /images/Job_Status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Job_Status.png -------------------------------------------------------------------------------- /images/Node_Status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Node_Status.png -------------------------------------------------------------------------------- /images/Scheduler_Info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Scheduler_Info.png -------------------------------------------------------------------------------- /internal/util/time.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "time" 4 | 5 | func NowEpoch() int64 { 6 | return time.Now().Unix() 7 | } 8 | -------------------------------------------------------------------------------- /docker/commands: -------------------------------------------------------------------------------- 1 | docker build -t 24.05 -f 24.05.dockerfile . 
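# the 23.11 image can be built the same way from its sibling dockerfile: docker build -t 23.11 -f 23.11.dockerfile .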
2 | 3 | docker run -it --rm --name slurm_container --entrypoint /bin/bash 24.05 4 | -------------------------------------------------------------------------------- /internal/util/fmt.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "strings" 4 | 5 | func RemoveWhitespace(s string) string { 6 | return strings.Join(strings.Fields(s), "") 7 | } 8 | -------------------------------------------------------------------------------- /openapitools.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "./node_modules/@openapitools/openapi-generator-cli/config.schema.json", 3 | "spaces": 2, 4 | "generator-cli": { 5 | "version": "7.8.0" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docker/lets_go_job.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=lets_go 3 | #SBATCH --output=/jobs/output/lets_go.out 4 | #SBATCH --error=/jobs/err/lets_go.err 5 | #SBATCH --ntasks=1 6 | printf "Lets go\n" 7 | -------------------------------------------------------------------------------- /docker/hello_world_job.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=hello_world 3 | #SBATCH --output=/jobs/output/hello_world.out 4 | #SBATCH --error=/jobs/err/hello_world.err 5 | #SBATCH --time=00:05:00 6 | #SBATCH --ntasks=1 7 | printf "Hello world\n" 8 | -------------------------------------------------------------------------------- /internal/types/keys.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type Key int 4 | 5 | const ( 6 | ApiCacheKey Key = iota 7 | ApiCacheTimeoutKey 8 | ApiUserKey 9 | ApiTokenKey 10 | ApiURLKey 11 | ApiJobsEndpointKey 12 | ApiNodesEndpointKey 13 | ApiPartitionsEndpointKey 14 | ApiDiagEndpointKey 15 | ApiSharesEndpointKey 16 | ) 17 | -------------------------------------------------------------------------------- /extras/systemd/prometheus-slurm-exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus SLURM Exporter 3 | 4 | [Service] 5 | ExecStart=/usr/local/sbin/prometheus-slurm-exporter 6 | EnvironmentFile=/etc/prometheus-slurm-exporter/env.conf 7 | Restart=always 8 | RestartSec=15 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | -------------------------------------------------------------------------------- /docker/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | AuthInfo=/var/run/munge/munge.socket.2 2 | AuthType=auth/munge 3 | DbdHost=localhost 4 | DebugLevel=info 5 | DbdPort=6819 6 | LogFile=/var/log/slurm/slurmdbd.log 7 | SlurmUser=slurm 8 | StorageHost=localhost # or the database server host 9 | StoragePass=root 10 | StorageType=accounting_storage/mysql 11 | StorageUser=slurm 12 | -------------------------------------------------------------------------------- /docker/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupMountpoint=/sys/fs/cgroup 2 | ConstrainCores=yes 3 | EnableControllers=yes 4 | ConstrainRAMSpace=yes 5 | CgroupPlugin=cgroup/v2 6 | ConstrainSwapSpace=yes 7 | ConstrainDevices=yes 8 | #CgroupAutomount=yes Defunct option 9 | AllowedRamSpace=100 10 | AllowedSwapSpace=0 11 | MaxRAMPercent=99 12 | 
MaxSwapPercent=0 13 | MinRAMSpace=200 14 | 15 | IgnoreSystemd=yes 16 | -------------------------------------------------------------------------------- /docker/start_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Submitting jobs" 4 | sbatch /jobs/hello_world_job.sbatch 5 | if [ $? -eq 0 ]; then 6 | echo "hello_world_job.sbatch submitted successfully" 7 | else 8 | echo "Failed to submit hello_world_job.sbatch" 9 | fi 10 | 11 | sbatch /jobs/lets_go_job.sbatch 12 | if [ $? -eq 0 ]; then 13 | echo "lets_go_job.sbatch submitted successfully" 14 | else 15 | echo "Failed to submit lets_go_job.sbatch" 16 | fi 17 | -------------------------------------------------------------------------------- /internal/api/endpoints.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | type endpoint struct { 10 | key types.Key 11 | name string 12 | path string 13 | } 14 | 15 | // this gives a compile warning but centralizes the endpoints 16 | var endpoints = versionedEndpoints 17 | 18 | func RegisterEndpoints(ctx context.Context) context.Context { 19 | for _, e := range endpoints { 20 | ctx = context.WithValue(ctx, e.key, e.path) 21 | } 22 | return ctx 23 | } 24 | -------------------------------------------------------------------------------- /internal/api/endpoints_2311.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.40/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.40/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.40/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.40/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", "/slurm/v0.0.40/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /internal/api/endpoints_2405.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.41/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.41/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.41/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.41/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", "/slurm/v0.0.41/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /internal/api/endpoints_2411.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.42/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.42/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.42/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.42/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", 
"/slurm/v0.0.42/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/lcrownover/prometheus-slurm-exporter 2 | 3 | go 1.22.5 4 | 5 | require ( 6 | github.com/akyoto/cache v1.0.6 7 | github.com/prometheus/client_golang v1.19.1 8 | ) 9 | 10 | require ( 11 | github.com/beorn7/perks v1.0.1 // indirect 12 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 13 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 14 | github.com/prometheus/client_model v0.6.1 // indirect 15 | github.com/prometheus/common v0.55.0 // indirect 16 | github.com/prometheus/procfs v0.15.1 // indirect 17 | golang.org/x/sys v0.22.0 // indirect 18 | google.golang.org/protobuf v1.34.2 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /internal/api/handler.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "net/http" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | ) 11 | 12 | func beforeCollect(ctx context.Context) { 13 | err := PopulateCache(ctx) 14 | if err != nil { 15 | slog.Error("error populating request cache", "error", err) 16 | } 17 | } 18 | 19 | func afterCollect(ctx context.Context) { 20 | WipeCache(ctx) 21 | } 22 | 23 | func MetricsHandler(r *prometheus.Registry, ctx context.Context) http.HandlerFunc { 24 | h := promhttp.HandlerFor(r, promhttp.HandlerOpts{}) 25 | 26 | return func(w http.ResponseWriter, r *http.Request) { 27 | beforeCollect(ctx) 28 | h.ServeHTTP(w, r) 29 | afterCollect(ctx) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /internal/types/nodes.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type NodeState string 4 | 5 | const ( 6 | NodeStateAlloc NodeState = "alloc" 7 | NodeStateComp NodeState = "comp" 8 | NodeStateDown NodeState = "down" 9 | NodeStateDrain NodeState = "drain" 10 | NodeStateFail NodeState = "fail" 11 | NodeStateErr NodeState = "err" 12 | NodeStateIdle NodeState = "idle" 13 | NodeStateMaint NodeState = "maint" 14 | NodeStateMix NodeState = "mix" 15 | NodeStateResv NodeState = "resv" 16 | NodeStatePlanned NodeState = "planned" 17 | NodeStateNotResponding NodeState = "not_responding" 18 | NodeStateInvalid NodeState = "invalid" 19 | NodeStateInvalidReg NodeState = "invalid_reg" 20 | NodeStateDynamicNorm NodeState = "dynamic_norm" 21 | ) 22 | -------------------------------------------------------------------------------- /internal/types/jobs.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type JobState string 4 | 5 | const ( 6 | JobStatePending JobState = "pending" 7 | JobStateCompleted JobState = "pompleted" 8 | JobStateFailed JobState = "failed" 9 | JobStateOutOfMemory JobState = "out_of_memory" 10 | JobStateRunning JobState = "running" 11 | JobStateSuspended JobState = "suspended" 12 | JobStateUnknown JobState = "unknown" 13 | JobStateTimeout JobState = "timeout" 14 | JobStateCancelled JobState = "cancelled" 15 | JobStateCompleting JobState = "completing" 16 | JobStateConfiguring JobState = "configuring" 17 | JobStatePreempted JobState = "preempted" 18 | JobStateNodeFail JobState = "node_fail" 19 | ) 
20 | 21 | type SlurmJobsResponse struct { 22 | Jobs []slurmJob `json:"jobs"` 23 | } 24 | 25 | type slurmJobCPUs struct { 26 | Number int `json:"number"` 27 | } 28 | 29 | type slurmJob struct { 30 | Account string `json:"account"` 31 | JobStates []string `json:"job_state"` 32 | CPUs slurmJobCPUs `json:"cpus"` 33 | } 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = prometheus-slurm-exporter 2 | 3 | ifndef SLURM_VERSION 4 | $(error SLURM_VERSION environment variable is not set) 5 | endif 6 | 7 | slurm_version := ${SLURM_VERSION} 8 | 9 | # If SLURM_VERSION is "all", print an error message for the default build target 10 | build: 11 | ifeq ($(slurm_version),all) 12 | $(error You must set a specific SLURM_VERSION to build) 13 | else 14 | mkdir -p bin/ 15 | go build -tags=$(subst .,,$(slurm_version)) -o bin/prometheus-slurm-exporter cmd/prometheus-slurm-exporter/main.go 16 | endif 17 | 18 | test: 19 | ifeq ($(slurm_version),all) 20 | # Generate and test for version 24.05 21 | go test -tags=2405 -v ./... 22 | # Generate and test for version 23.11 23 | go test -tags=2311 -v ./... 24 | else 25 | go test -tags=$(subst .,,$(slurm_version)) -v ./... 26 | endif 27 | 28 | install: 29 | cp bin/prometheus-slurm-exporter /usr/local/sbin/prometheus-slurm-exporter 30 | cp extras/systemd/prometheus-slurm-exporter.service /etc/systemd/system/prometheus-slurm-exporter.service 31 | systemctl daemon-reload 32 | -------------------------------------------------------------------------------- /internal/util/infinity.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "strings" 4 | 5 | func CleanseInfinity(b []byte) []byte { 6 | // this is disgusting but the response has values of "Infinity" which are 7 | // not json unmarshal-able, so I manually replace all the "Infinity"s with the correct 8 | // float64 value that represents Infinity. 9 | // this will be fixed in v0.0.42 10 | // https://support.schedmd.com/show_bug.cgi?id=20817 11 | // 12 | // https://github.com/lcrownover/prometheus-slurm-exporter/issues/8 13 | // also reported that folks are getting "inf" back, so I'll protect for that too 14 | bs := string(b) 15 | maxFloatStr := ": 1.7976931348623157e+308" 16 | // replacing the longer strings first should prevent any partial replacements 17 | bs = strings.ReplaceAll(bs, ": Infinity", maxFloatStr) 18 | bs = strings.ReplaceAll(bs, ": infinity", maxFloatStr) 19 | // sometimes it'd return "inf", so let's cover for that too. 20 | bs = strings.ReplaceAll(bs, ": Inf", maxFloatStr) 21 | bs = strings.ReplaceAll(bs, ": inf", maxFloatStr) 22 | return []byte(bs) 23 | } 24 | -------------------------------------------------------------------------------- /.github/workflows/gotest.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Go Test 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | - develop 9 | pull_request: 10 | branches: 11 | - main 12 | - develop 13 | 14 | jobs: 15 | 16 | build_2311: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@v5 23 | with: 24 | go-version: '1.22.5' 25 | 26 | - name: Build 27 | run: go build -tags=2311 -v ./... 28 | 29 | - name: Test 30 | run: go test -tags=2311 -v ./... 
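  # build_2405 and build_2411 below repeat the same checkout/setup/build/test steps with their matching Go build tag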
31 | 32 | build_2405: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Go 38 | uses: actions/setup-go@v5 39 | with: 40 | go-version: '1.22.5' 41 | 42 | - name: Build 43 | run: go build -tags=2405 -v ./... 44 | 45 | - name: Test 46 | run: go test -tags=2405 -v ./... 47 | 48 | build_2411: 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/checkout@v4 52 | 53 | - name: Set up Go 54 | uses: actions/setup-go@v5 55 | with: 56 | go-version: '1.22.5' 57 | 58 | - name: Build 59 | run: go build -tags=2411 -v ./... 60 | 61 | - name: Test 62 | run: go test -tags=2411 -v ./... 63 | -------------------------------------------------------------------------------- /internal/util/testdata.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "path" 9 | "runtime" 10 | ) 11 | 12 | // getTestDataDir returns the path to the `testdata` directory in the project 13 | func getTestDataDir() string { 14 | _, filename, _, _ := runtime.Caller(0) 15 | dir := path.Join(path.Dir(filename), "../..") 16 | err := os.Chdir(dir) 17 | if err != nil { 18 | panic(err) 19 | } 20 | return fmt.Sprintf(dir + "/testdata/") 21 | } 22 | 23 | // GetTestDataFilePath returns the full filepath to the specified filename 24 | // of test data. 25 | // 26 | // Example: GetTestDataFilePath("SomeTestData.json") -> 27 | // 28 | // /home/me/prometheus-slurm-exporter/testdata/SomeTestData.json 29 | func GetTestDataFilePath(filename string) string { 30 | testDataDir := getTestDataDir() 31 | return fmt.Sprintf("%s/%s", testDataDir, filename) 32 | } 33 | 34 | // ReadTestDataBytes takes the short filename of the desired test data file 35 | // and returns that data as bytes. 36 | func ReadTestDataBytes(filename string) []byte { 37 | filepath := GetTestDataFilePath(filename) 38 | file, err := os.Open(filepath) 39 | if err != nil { 40 | log.Fatalf("failed to open file: %v\n", err) 41 | } 42 | defer file.Close() 43 | 44 | data, err := io.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("failed to read file: %v\n", err) 47 | } 48 | 49 | return data 50 | } 51 | -------------------------------------------------------------------------------- /docker/slurm.conf: -------------------------------------------------------------------------------- 1 | # Basic SLURM configuration 2 | ClusterName=slurm_head 3 | SlurmdPort=6280 4 | SlurmUser=slurm 5 | SlurmctldPort=6281 6 | ProctrackType=proctrack/pgid 7 | # Slurm controller host 8 | #AccountingStorageType=accounting_storage/slurmdbd <-- this breaks squeue and sinfo. Does not allow the daemons to properly come up 9 | AuthType=auth/munge 10 | #AuthAltTypes=auth/jwt 11 | SlurmctldHost=localhost 12 | SlurmctldParameters=enable_configless 13 | #ControlMachine=localhost <- not needed and is ignored by slurm.conf and slurmd 14 | StateSaveLocation=/var/spool/slurm 15 | #CgroupPlugin=cgroup/v2 16 | #DebugFlags=all <- this "all" option does not work. Tried "ALL" as well. 
Removing all together now 17 | SlurmdDebug=debug3 18 | SlurmdLogFile=/var/log/slurm/slurmd.log 19 | SlurmctldLogFile=/var/log/slurm/slurmctld.log 20 | 21 | #TaskPlugin=task/cgroup 12/23/24 22 | #SelectType=select/linear 12/23/24 23 | 24 | # Specify node as both a controller and compute node 25 | #NodeName=localhost CPUs=2 RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN 26 | NodeName=localhost CPUs=1 RealMemory=2048 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 27 | #NodeName=localhost CPUs=12 RealMemory=2048 Sockets=1 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN 28 | 29 | # partition with both controller and compute node 30 | PartitionName=debug Nodes=localhost Default=YES MaxTime=INFINITE State=UP 31 | -------------------------------------------------------------------------------- /internal/api/cache.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/akyoto/cache" 11 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 12 | ) 13 | 14 | // PopulateCache is used to populate the cache with data from the slurm api 15 | func PopulateCache(ctx context.Context) error { 16 | slog.Debug("populating cache") 17 | var data []byte 18 | var err error 19 | 20 | apiCache := ctx.Value(types.ApiCacheKey).(*cache.Cache) 21 | 22 | var wg sync.WaitGroup 23 | wg.Add(len(endpoints)) 24 | errors := make(chan error, len(endpoints)) 25 | 26 | for _, e := range endpoints { 27 | go func(e endpoint) { 28 | defer wg.Done() 29 | data, err = GetSlurmRestResponse(ctx, e.key) 30 | if err != nil { 31 | errors <- fmt.Errorf("failed to get slurmrestd %s response: %v", e.path, err) 32 | } 33 | apiCache.Set(e.name, data, 0) 34 | }(e) 35 | } 36 | 37 | wg.Wait() 38 | close(errors) 39 | 40 | var errmsgs []string 41 | for err := range errors { 42 | errmsgs = append(errmsgs, err.Error()) 43 | return fmt.Errorf("error(s) encountered calling slurm api: [%s]", strings.Join(errmsgs, ", ")) 44 | } 45 | 46 | slog.Debug("finished populating cache") 47 | 48 | return nil 49 | } 50 | 51 | func WipeCache(ctx context.Context) error { 52 | apiCache := ctx.Value(types.ApiCacheKey).(*cache.Cache) 53 | apiCache.Delete("diag") 54 | apiCache.Delete("nodes") 55 | apiCache.Delete("jobs") 56 | apiCache.Delete("partitions") 57 | apiCache.Delete("shares") 58 | return nil 59 | } 60 | -------------------------------------------------------------------------------- /docker/slurm.dockerfile: -------------------------------------------------------------------------------- 1 | FROM nathanhess/slurm:full-root 2 | 3 | # Install systemd 4 | RUN apt-get update && apt-get install -y \ 5 | systemd \ 6 | && apt-get clean \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | # Create necessary directories 10 | RUN mkdir -p /etc/systemd/system/multi-user.target.wants /container/jobs /container/output /container/err 11 | 12 | # Copy Slurm configuration files 13 | COPY slurm.conf /etc/slurm/slurm.conf 14 | COPY cgroup.conf /etc/slurm/cgroup.conf 15 | 16 | # Set arguments for resources 17 | ARG CPU=4 18 | ARG MEMORY=8192 19 | 20 | # Display OS info 21 | RUN echo "Container OS:" && cat /etc/os-release 22 | 23 | # Create sample SLURM job scripts 24 | RUN echo '#!/bin/bash\n\ 25 | #SBATCH --job-name=hello_world\n\ 26 | #SBATCH --output=/container/output/hello_world.out\n\ 27 | #SBATCH --error=/container/err/hello_world.err\n\ 28 | #SBATCH --time=00:05:00\n\ 29 | #SBATCH --ntasks=1\n\n\ 30 | 
echo "Hello World"\n\ 31 | sleep 300' > /container/jobs/hello_world_job.sbatch 32 | 33 | RUN echo '#!/bin/bash\n\ 34 | #SBATCH --job-name=lets_go\n\ 35 | #SBATCH --output=/container/output/lets_go.out\n\ 36 | #SBATCH --error=/container/err/lets_go.err\n\ 37 | #SBATCH --time=00:05:00\n\ 38 | #SBATCH --ntasks=1\n\n\ 39 | echo "Let'\''s Go"\n\ 40 | sleep 300' > /container/jobs/lets_go_job.sbatch 41 | 42 | RUN chmod +x /container/jobs/hello_world_job.sbatch /container/jobs/lets_go_job.sbatch 43 | 44 | COPY start_slurm.sh /usr/local/bin/start_slurm.sh 45 | RUN chmod +x /usr/local/bin/start_slurm.sh 46 | 47 | ENTRYPOINT ["/usr/local/bin/start_slurm.sh"] 48 | CMD ["/bin/systemd"] 49 | 50 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # This is an example .goreleaser.yml file with some sensible defaults. 2 | # Make sure to check the documentation at https://goreleaser.com 3 | 4 | # The lines below are called `modelines`. See `:help modeline` 5 | # Feel free to remove those if you don't want/need to use them. 6 | # yaml-language-server: $schema=https://goreleaser.com/static/schema.json 7 | # vim: set ts=2 sw=2 tw=0 fo=cnqoj 8 | 9 | version: 2 10 | 11 | before: 12 | hooks: 13 | - go mod tidy 14 | 15 | archives: 16 | - format: binary 17 | 18 | changelog: 19 | sort: asc 20 | filters: 21 | exclude: 22 | - "^docs:" 23 | - "^test:" 24 | builds: 25 | - id: 'slurm-23.11' 26 | main: ./cmd/prometheus-slurm-exporter/main.go 27 | binary: prometheus-slurm-exporter_slurm-23.11_{{ .Os }}_{{ .Arch }} 28 | flags: 29 | - -tags=2311 30 | env: 31 | - CGO_ENABLED=0 32 | goos: 33 | - linux 34 | goarch: 35 | - amd64 36 | - arm64 37 | 38 | - id: 'slurm-24.05' 39 | main: ./cmd/prometheus-slurm-exporter/main.go 40 | binary: prometheus-slurm-exporter_slurm-24.05_{{ .Os }}_{{ .Arch }} 41 | flags: 42 | - -tags=2405 43 | env: 44 | - CGO_ENABLED=0 45 | goos: 46 | - linux 47 | goarch: 48 | - amd64 49 | - arm64 50 | 51 | - id: 'slurm-24.11' 52 | main: ./cmd/prometheus-slurm-exporter/main.go 53 | binary: prometheus-slurm-exporter_slurm-24.11_{{ .Os }}_{{ .Arch }} 54 | flags: 55 | - -tags=2411 56 | env: 57 | - CGO_ENABLED=0 58 | goos: 59 | - linux 60 | goarch: 61 | - amd64 62 | - arm64 63 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2311_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("V0040OpenapiDiagResp.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0040OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0040OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func 
TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0040OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0040OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2405_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("SlurmV0041GetDiag200Response.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0041OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0041OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0041OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0041OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2411_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("SlurmV0041GetDiag200Response.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0041OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0041OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | 
t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0041OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0041OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /docker/start_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure logfile and /var/log/munge have the correct ownership 4 | # Start munge daemon as munge user 5 | echo "Starting the munge daemon" 6 | sudo -u munge /usr/sbin/munged 7 | 8 | # Check if munge daemon started successfully 9 | if ps aux | grep -q '[m]unged'; then 10 | echo "Munge daemon started successfully" 11 | else 12 | echo "Failed to start Munge daemon" 13 | exit 1 14 | fi 15 | 16 | # Output the Slurm configuration 17 | #cat /usr/local/etc/slurm.conf 18 | # Start the slurmctld daemon 19 | echo "Starting the slurmctld daemon" 20 | if /slurm/src/slurmctld/slurmctld -f /usr/local/etc/slurm.conf; then 21 | echo "slurmctld daemon started successfully" 22 | else 23 | echo "Failed to start slurmctld daemon" 24 | exit 1 25 | fi 26 | 27 | # Start the slurmd daemon 28 | echo "Starting the slurmd daemon" 29 | if /slurm/src/slurmd/slurmd/slurmd --conf-server localhost:6281; then 30 | echo "slurmd daemon started successfully" 31 | else 32 | echo "Failed to start slurmd daemon" 33 | exit 1 34 | fi 35 | 36 | echo "Starting the slurmdbd daemon" 37 | if /slurm/src/slurmdbd/slurmdbd; then 38 | echo "slurmdbd daemon started successfully" 39 | else 40 | echo "Failed to start slurmd daemon" 41 | exit 1 42 | fi 43 | 44 | sleep 3 45 | ps aux | grep munged | grep -v grep 46 | ps aux | grep slurmd | grep -v grep 47 | ps aux | grep slurmctld | grep -v grep 48 | ps aux | grep slurmdbd | grep -v grep 49 | #echo "Submitting jobs" 50 | #sbatch /jobs/hello_world_job.sbatch 51 | #if [ $? -eq 0 ]; then 52 | # echo "hello_world_job.sbatch submitted successfully" 53 | #else 54 | # echo "Failed to submit hello_world_job.sbatch" 55 | #fi 56 | # 57 | #sbatch /jobs/lets_go_job.sbatch 58 | #if [ $? 
-eq 0 ]; then 59 | # echo "lets_go_job.sbatch submitted successfully" 60 | #else 61 | # echo "Failed to submit lets_go_job.sbatch" 62 | #fi 63 | 64 | # Keep the container running 65 | #tail -f /dev/null 66 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/akyoto/cache v1.0.6 h1:5XGVVYoi2i+DZLLPuVIXtsNIJ/qaAM16XT0LaBaXd2k= 2 | github.com/akyoto/cache v1.0.6/go.mod h1:WfxTRqKhfgAG71Xh6E3WLpjhBtZI37O53G4h5s+3iM4= 3 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 4 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 5 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 6 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 7 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 8 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 11 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 12 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 13 | github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= 14 | github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= 15 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 16 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 17 | github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= 18 | github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= 19 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 20 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 21 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= 22 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 23 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 24 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 25 | -------------------------------------------------------------------------------- /internal/slurm/fairshare.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type FairShareCollector struct { 14 | ctx context.Context 15 | fairshare *prometheus.Desc 16 | } 17 | 18 | func NewFairShareCollector(ctx context.Context) *FairShareCollector { 19 | labels := []string{"account"} 20 | return &FairShareCollector{ 21 | ctx: ctx, 22 | fairshare: prometheus.NewDesc("slurm_account_fairshare", "FairShare for account", labels, nil), 23 | } 24 | } 25 | 26 | func (fsc *FairShareCollector) Describe(ch 
chan<- *prometheus.Desc) { 27 | ch <- fsc.fairshare 28 | } 29 | 30 | func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) { 31 | apiCache := fsc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 32 | sharesRespBytes, found := apiCache.Get("shares") 33 | if !found { 34 | slog.Error("failed to get shares response for fair share metrics from cache") 35 | return 36 | } 37 | 38 | sharesData, err := api.ProcessSharesResponse(sharesRespBytes.([]byte)) 39 | if err != nil { 40 | slog.Error("failed to process shares response for fair share metrics", "error", err) 41 | return 42 | } 43 | fsm, err := ParseFairShareMetrics(sharesData) 44 | if err != nil { 45 | slog.Error("failed to collect fair share metrics", "error", err) 46 | return 47 | } 48 | for f := range fsm { 49 | ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) 50 | } 51 | } 52 | 53 | type fairShareMetrics struct { 54 | fairshare float64 55 | } 56 | 57 | func NewFairShareMetrics() *fairShareMetrics { 58 | return &fairShareMetrics{} 59 | } 60 | 61 | func ParseFairShareMetrics(sharesData *api.SharesData) (map[string]*fairShareMetrics, error) { 62 | accounts := make(map[string]*fairShareMetrics) 63 | for _, s := range sharesData.Shares { 64 | account := s.Name 65 | if account == "root" { 66 | // we don't care about the root account 67 | continue 68 | } 69 | if _, exists := accounts[account]; !exists { 70 | accounts[account] = NewFairShareMetrics() 71 | } 72 | accounts[account].fairshare = s.EffectiveUsage 73 | } 74 | return accounts, nil 75 | } 76 | -------------------------------------------------------------------------------- /internal/api/responses_2311.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | var apiVersion = "23.11" 6 | 7 | type DiagResp struct { 8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"allocated_cores"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 
`json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage *float64 `json:"effective_usage"` 70 | } `json:"shares"` 71 | } `json:"shares"` 72 | } 73 | -------------------------------------------------------------------------------- /internal/api/responses_2405.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | var apiVersion = "24.05" 6 | 7 | type DiagResp struct { 8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"cpus"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 `json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage struct { 70 | Number *float64 `json:"number"` 71 | } `json:"effective_usage"` 72 | } `json:"shares"` 73 | } `json:"shares"` 74 | } 75 | -------------------------------------------------------------------------------- /internal/api/responses_2411.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | var apiVersion = "24.11" 6 | 7 | type DiagResp struct { 
8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"cpus"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 `json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage *struct { 70 | Number *float64 `json:"number"` 71 | } `json:"effective_usage"` 72 | } `json:"shares"` 73 | } `json:"shares"` 74 | } 75 | -------------------------------------------------------------------------------- /internal/api/unmarshalers.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log/slog" 7 | 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 9 | ) 10 | 11 | func ProcessDiagResponse(b []byte) (*DiagData, error) { 12 | var r DiagResp 13 | if len(b) == 0 { 14 | return nil, fmt.Errorf("failed to unmarshal diag response, body is empty") 15 | } 16 | err := json.Unmarshal(b, &r) 17 | if err != nil { 18 | slog.Debug("failed to unmarshal diag response", "body", string(b)) 19 | return nil, fmt.Errorf("failed to unmarshall diag response data: %v", err) 20 | } 21 | d := NewDiagData() 22 | d.FromResponse(r) 23 | return d, nil 24 | } 25 | 26 | // ProcessJobsResponse converts the response bytes into a slurm type 27 | func ProcessJobsResponse(b []byte) (*JobsData, error) { 28 | var r JobsResp 29 | if len(b) == 0 { 30 | return nil, fmt.Errorf("failed to unmarshal jobs response, body is empty") 31 | } 32 | err := json.Unmarshal(b, &r) 33 | if err != nil { 34 | slog.Debug("failed to unmarshal jobs response", "body", string(b)) 
35 | return nil, fmt.Errorf("failed to unmarshall jobs response data: %v", err) 36 | } 37 | d := NewJobsData() 38 | d.FromResponse(r) 39 | return d, nil 40 | } 41 | 42 | // ProcessNodesResponse converts the response bytes into a slurm type 43 | func ProcessNodesResponse(b []byte) (*NodesData, error) { 44 | var r NodesResp 45 | if len(b) == 0 { 46 | return nil, fmt.Errorf("failed to unmarshal nodes response, body is empty") 47 | } 48 | err := json.Unmarshal(b, &r) 49 | if err != nil { 50 | slog.Debug("failed to unmarshal nodes response", "body", string(b)) 51 | return nil, fmt.Errorf("failed to unmarshall nodes response data: %v", err) 52 | } 53 | d := NewNodesData() 54 | d.FromResponse(r) 55 | return d, nil 56 | } 57 | 58 | // ProcessPartitionsResponse converts the response bytes into a slurm type 59 | func ProcessPartitionsResponse(b []byte) (*PartitionsData, error) { 60 | var r PartitionsResp 61 | if len(b) == 0 { 62 | return nil, fmt.Errorf("failed to unmarshal partitions response, body is empty") 63 | } 64 | err := json.Unmarshal(b, &r) 65 | if err != nil { 66 | slog.Debug("failed to unmarshal partitions response", "body", string(b)) 67 | return nil, fmt.Errorf("failed to unmarshall partitions response data: %v", err) 68 | } 69 | d := NewPartitionsData() 70 | d.FromResponse(r) 71 | return d, nil 72 | } 73 | 74 | // ProcessSharesResponse converts the response bytes into a slurm type 75 | func ProcessSharesResponse(b []byte) (*SharesData, error) { 76 | b = util.CleanseInfinity(b) 77 | var r SharesResp 78 | if len(b) == 0 { 79 | return nil, fmt.Errorf("failed to unmarshal shares response, body is empty") 80 | } 81 | err := json.Unmarshal(b, &r) 82 | if err != nil { 83 | slog.Debug("failed to unmarshal shares response", "body", string(b)) 84 | return nil, fmt.Errorf("failed to unmarshall shares response data: %v", err) 85 | } 86 | 87 | d := NewSharesData() 88 | d.FromResponse(r) 89 | return d, nil 90 | } 91 | -------------------------------------------------------------------------------- /docker/23.11.dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | RUN dnf update -y && \ 3 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 4 | dnf install -y --enablerepo=devel mariadb-devel python3-PyMySQL hwloc lz4-devel wget bzip2 perl munge-devel munge cmake jansson libjwt-devel libjwt json-c-devel json-c http-parser-devel http-parser libcgroup libcgroup-tools dbus-devel && \ 5 | dnf group install -y "Development Tools" 6 | 7 | RUN dnf install -y sudo 8 | 9 | RUN dnf -y update && \ 10 | dnf install -y systemd && \ 11 | dnf clean all && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | RUN adduser slurm 15 | 16 | # Install http_parser 17 | RUN git clone --depth 1 --single-branch -b v2.9.4 https://github.com/nodejs/http-parser.git http_parser \ 18 | && cd http_parser \ 19 | && make \ 20 | && make install 21 | 22 | RUN dnf install -y systemd 23 | 24 | WORKDIR /slurm 25 | RUN wget https://download.schedmd.com/slurm/slurm-23.11-latest.tar.bz2 && tar -xvjf slurm-23.11-latest.tar.bz2 --strip-components=1 26 | 27 | RUN ./configure \ 28 | --with-cgroup-v2 \ 29 | --with-http-parser=/usr/local/ \ 30 | --enable-slurmrestd \ 31 | && make && make install 32 | 33 | # Create the /var/log/slurm directory and set permissions 34 | RUN mkdir -p /var/log/slurm && \ 35 | chown slurm:slurm /var/log/slurm && \ 36 | chmod 750 /var/log/slurm && \ 37 | touch /var/log/slurm/slurmd.log && \ 38 | touch 
/var/log/slurm/slurmctld.log && \ 39 | chown slurm:slurm /var/log/slurm/slurmctld.log /var/log/slurm/slurmd.log 40 | 41 | RUN getent group munge || groupadd -r munge && \ 42 | getent passwd munge || useradd -r -g munge munge && \ 43 | mkdir -p /var/log/munge && \ 44 | chown munge:munge /var/log/munge && \ 45 | chmod 750 /var/log/munge && \ 46 | /usr/sbin/create-munge-key && \ 47 | chown munge:munge /etc/munge/munge.key && \ 48 | chmod 400 /etc/munge/munge.key 49 | 50 | RUN touch /var/log/munge/munged.log && \ 51 | chown munge:munge /var/log/munge/munged.log 52 | 53 | COPY slurm.conf /usr/local/etc/slurm.conf 54 | 55 | USER root 56 | COPY cgroup.conf /usr/local/etc/cgroup.conf 57 | COPY slurm.conf /usr/local/etc/slurm.conf 58 | COPY start_slurm.sh /start_slurm.sh 59 | RUN chmod 755 /start_slurm.sh 60 | RUN mkdir -p /var/spool/slurm /var/spool/slurmd && \ 61 | chown slurm:slurm /var/spool/slurm /var/spool/slurmd 62 | 63 | RUN mkdir -p /jobs /jobs/output /jobs/err 64 | 65 | # Create sample SLURM job scripts 66 | 67 | COPY hello_world_job.sbatch /jobs/hello_world_job.sbatch 68 | COPY lets_go_job.sbatch /jobs/lets_go_job.sbatch 69 | 70 | RUN chmod +x /jobs/hello_world_job.sbatch /jobs/lets_go_job.sbatch 71 | 72 | EXPOSE 6280 73 | 74 | RUN ln -s slurm/src/slurmd/slurmd/slurmd /bin/slurmd 75 | 76 | #RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.41 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.41.json 77 | #RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.40 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.40.json 78 | ENTRYPOINT ["/start_slurm.sh"] 79 | -------------------------------------------------------------------------------- /docker/build_slurm_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script is just a quick tool to generate a slurmrestd container 4 | # and dump the latest openapi spec to ./openapi-specs 5 | 6 | import subprocess 7 | import sys 8 | 9 | if len(sys.argv) != 2: 10 | print(f"usage: {sys.argv[0]} ") 11 | exit(1) 12 | 13 | slurm_version = sys.argv[1] 14 | 15 | versions = { 16 | "24.05": { 17 | "api_version": "0.0.41", 18 | "container_version": "24.05", 19 | }, 20 | "23.11": { 21 | "api_version": "0.0.40", 22 | "container_version": "24.05", 23 | }, 24 | } 25 | 26 | if slurm_version not in versions: 27 | print( 28 | "supported slurm versions: {}".format(", ".join([v for v in versions.keys()])) 29 | ) 30 | exit(1) 31 | 32 | oapi_version = versions[slurm_version]["api_version"] 33 | container_version = versions[slurm_version]["container_version"] 34 | 35 | 36 | def cleanup_container(container_version: str): 37 | container_delete_command = f"docker rm -f slurm-{container_version}" 38 | s = subprocess.run( 39 | container_delete_command.split(), 40 | stdout=subprocess.PIPE, 41 | stderr=subprocess.PIPE, 42 | universal_newlines=True, 43 | ) 44 | 45 | if s.returncode != 0: 46 | raise Exception(f"Failed to clean up container: {s.stderr}") 47 | 48 | 49 | def build_container(container_version: str): 50 | build_command = f"docker build -t slurm_{container_version} --file {container_version}.dockerfile ." 
51 | s = subprocess.run( 52 | build_command.split(), 53 | stdout=subprocess.PIPE, 54 | stderr=subprocess.PIPE, 55 | universal_newlines=True, 56 | ) 57 | 58 | if s.returncode != 0: 59 | raise Exception(f"Failed to build SLURM: {s.stderr}") 60 | 61 | 62 | def create_container(container_version: str) -> str: 63 | create_command = ( 64 | f"docker create --name slurm-{container_version} slurm_{container_version}" 65 | ) 66 | s = subprocess.run( 67 | create_command.split(), 68 | stdout=subprocess.PIPE, 69 | stderr=subprocess.PIPE, 70 | universal_newlines=True, 71 | ) 72 | 73 | if s.returncode != 0: 74 | raise Exception(f"Failed to create SLURM container: {s.stderr}") 75 | 76 | return s.stdout.strip() 77 | 78 | 79 | def copy_container_file(container_id: str, oapi_version: str): 80 | copy_command = f"docker cp {container_id}:/slurm/v{oapi_version}.json ../openapi-specs/{slurm_version}.json" 81 | s = subprocess.run( 82 | copy_command.split(), 83 | stdout=subprocess.PIPE, 84 | stderr=subprocess.PIPE, 85 | universal_newlines=True, 86 | ) 87 | 88 | if s.returncode != 0: 89 | raise Exception(f"Failed to copy Openapi specs from container: {s.stderr}") 90 | 91 | 92 | print( 93 | f"Building SLURM {container_version} to get Openapi manifest version {oapi_version}" 94 | ) 95 | 96 | try: 97 | build_container(container_version) 98 | container_id = create_container(container_version) 99 | copy_container_file(container_id, oapi_version) 100 | cleanup_container(container_version) 101 | print(f"Copied openapi spec {oapi_version} to ../openapi-specs/{slurm_version}.json") 102 | 103 | except Exception as e: 104 | print(f"Failed to copy openapi spec: {e}") 105 | cleanup_container(container_version) 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | You must have access to a slurm head node running `slurmrestd` and a valid token 4 | for that service. Take note of your slurm version, such as `24.05`, as you'll 5 | use this version when building. 6 | 7 | ## Requirements 8 | 9 | Install Go from source if you don't already have it. 
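The version pinned below matches the `go 1.22.5` directive in `go.mod` and the Go toolchain used in the GitHub Actions workflow.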
10 | 11 | ```bash 12 | export VERSION=1.22.5 OS=linux ARCH=amd64 13 | wget https://dl.google.com/go/go$VERSION.$OS-$ARCH.tar.gz 14 | tar -xzvf go$VERSION.$OS-$ARCH.tar.gz 15 | export PATH=$PWD/go/bin:$PATH 16 | ``` 17 | 18 | _Alternatively install Go using the packaging system of your Linux 19 | distribution._ 20 | 21 | ## Building 22 | 23 | ### Clone this repository and build 24 | 25 | Use Git to clone the source code: 26 | 27 | ```bash 28 | git clone https://github.com/lcrownover/prometheus-slurm-exporter.git 29 | cd prometheus-slurm-exporter 30 | ``` 31 | 32 | Build the binary for your SLURM version, for example 24.05: 33 | 34 | ```bash 35 | SLURM_VERSION=24.05 make 36 | ``` 37 | 38 | Run tests for a specific SLURM version: 39 | 40 | ```bash 41 | SLURM_VERSION=24.05 make test 42 | ``` 43 | 44 | Run the tests for all SLURM versions: 45 | 46 | ```bash 47 | SLURM_VERSION=all make test 48 | ``` 49 | 50 | Start the exporter: 51 | 52 | ```bash 53 | ./bin/prometheus-slurm-exporter 54 | ``` 55 | 56 | If you wish to run the exporter on a different port, or the default port (8080) 57 | is already in use, run with the following argument: 58 | 59 | ```bash 60 | ./bin/prometheus-slurm-exporter --listen-address="0.0.0.0:" 61 | ``` 62 | 63 | Query all metrics: 64 | 65 | ```bash 66 | curl http://localhost:8080/metrics 67 | ``` 68 | 69 | ### Cutting releases 70 | 71 | Once you're ready to cut a new release, perform the following steps on the 72 | `main` branch. 73 | 74 | Tag the release version: 75 | 76 | `git tag v1.0.1` 77 | 78 | Push the tag: 79 | 80 | `git push origin v1.0.1` 81 | 82 | Make sure you have `GITHUB_TOKEN` exported, then use `goreleaser` to create 83 | releases: 84 | 85 | `goreleaser release --clean` 86 | 87 | ## Adding Support for New Openapi Versions 88 | 89 | ### Install openapi-generator-cli and openjdk 90 | 91 | Install `openapi-generator-cli` globally with NPM: 92 | 93 | ```bash 94 | npm install -g @openapitools/openapi-generator-cli` 95 | ``` 96 | 97 | This package depends on having the `java` executable in `PATH`, so install java. 98 | 99 | For mac, `brew install java`, then following the brew message, symlink the JDK, 100 | `sudo ln -sfn /usr/local/opt/openjdk/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk.jdk` 101 | 102 | For ubuntu, `sudo snap install openjdk`. 103 | 104 | ### Generating and Saving Openapi specs from SLURM using Docker 105 | 106 | Navigate to the `docker` directory and use the python script to automatically 107 | grab and store an openapi yaml spec from a target slurm version into the 108 | `openapi-specs` directory. 109 | 110 | ```bash 111 | python build_slurm_version.py 24.11 112 | ``` 113 | 114 | ### Generating the Openapi code for new SLURM versions 115 | 116 | I do this for every new SLURM version, so it should already be done. 117 | 118 | Assuming 23.11: 119 | 120 | ```bash 121 | openapi-generator-cli generate \ 122 | -g go \ 123 | -i openapi-specs/23.11.json \ 124 | -o ../openapi-slurm-23-11 \ 125 | --package-name openapi_slurm_23_11 \ 126 | --git-user-id lcrownover \ 127 | --git-repo-id openapi-slurm-23-11 128 | ``` 129 | 130 | This will generate an entire git repository that you can toss up in GitHub. 
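The same invocation covers newer specs by substituting the version. For example, a 24.11 run would presumably look like the following; the output directory, package name, and git-repo-id here simply apply the naming convention above to 24.11 and are assumptions, not an existing repository:

```bash
openapi-generator-cli generate \
    -g go \
    -i openapi-specs/24.11.json \
    -o ../openapi-slurm-24-11 \
    --package-name openapi_slurm_24_11 \
    --git-user-id lcrownover \
    --git-repo-id openapi-slurm-24-11
```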
131 | -------------------------------------------------------------------------------- /internal/slurm/users.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type UsersCollector struct { 14 | ctx context.Context 15 | pending *prometheus.Desc 16 | pending_cpus *prometheus.Desc 17 | running *prometheus.Desc 18 | running_cpus *prometheus.Desc 19 | suspended *prometheus.Desc 20 | } 21 | 22 | func NewUsersCollector(ctx context.Context) *UsersCollector { 23 | labels := []string{"user"} 24 | return &UsersCollector{ 25 | ctx: ctx, 26 | pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), 27 | pending_cpus: prometheus.NewDesc("slurm_user_cpus_pending", "Pending cpus for user", labels, nil), 28 | running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), 29 | running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), 30 | suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), 31 | } 32 | } 33 | 34 | func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) { 35 | ch <- uc.pending 36 | ch <- uc.pending_cpus 37 | ch <- uc.running 38 | ch <- uc.running_cpus 39 | ch <- uc.suspended 40 | } 41 | 42 | func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) { 43 | apiCache := uc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 44 | jobsRespBytes, found := apiCache.Get("jobs") 45 | if !found { 46 | slog.Error("failed to get jobs response for users metrics from cache") 47 | return 48 | } 49 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 50 | if err != nil { 51 | slog.Error("failed to process jobs data for users metrics", "error", err) 52 | return 53 | } 54 | um, err := ParseUsersMetrics(jobsData) 55 | if err != nil { 56 | slog.Error("failed to collect user metrics", "error", err) 57 | return 58 | } 59 | for u := range um { 60 | if um[u].pending > 0 { 61 | ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) 62 | } 63 | if um[u].pending_cpus > 0 { 64 | ch <- prometheus.MustNewConstMetric(uc.pending_cpus, prometheus.GaugeValue, um[u].pending_cpus, u) 65 | } 66 | if um[u].running > 0 { 67 | ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) 68 | } 69 | if um[u].running_cpus > 0 { 70 | ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) 71 | } 72 | if um[u].suspended > 0 { 73 | ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) 74 | } 75 | } 76 | } 77 | 78 | func NewUserJobMetrics() *userJobMetrics { 79 | return &userJobMetrics{0, 0, 0, 0, 0} 80 | } 81 | 82 | type userJobMetrics struct { 83 | pending float64 84 | pending_cpus float64 85 | running float64 86 | running_cpus float64 87 | suspended float64 88 | } 89 | 90 | func ParseUsersMetrics(jobsData *api.JobsData) (map[string]*userJobMetrics, error) { 91 | users := make(map[string]*userJobMetrics) 92 | for _, j := range jobsData.Jobs { 93 | user := j.UserName 94 | if _, exists := users[user]; !exists { 95 | users[user] = NewUserJobMetrics() 96 | } 97 | 98 | switch j.JobState { 99 |
case types.JobStatePending: 100 | users[user].pending++ 101 | users[user].pending_cpus += float64(j.Cpus) 102 | case types.JobStateRunning: 103 | users[user].running++ 104 | users[user].running_cpus += float64(j.Cpus) 105 | case types.JobStateSuspended: 106 | users[user].suspended++ 107 | } 108 | } 109 | return users, nil 110 | } 111 | -------------------------------------------------------------------------------- /internal/slurm/gpus.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type GPUsCollector struct { 14 | ctx context.Context 15 | alloc *prometheus.Desc 16 | idle *prometheus.Desc 17 | other *prometheus.Desc 18 | total *prometheus.Desc 19 | utilization *prometheus.Desc 20 | } 21 | 22 | func NewGPUsCollector(ctx context.Context) *GPUsCollector { 23 | return &GPUsCollector{ 24 | ctx: ctx, 25 | alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), 26 | idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), 27 | other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil), 28 | total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), 29 | utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), 30 | } 31 | } 32 | 33 | func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) { 34 | ch <- cc.alloc 35 | ch <- cc.idle 36 | ch <- cc.other 37 | ch <- cc.total 38 | ch <- cc.utilization 39 | } 40 | func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) { 41 | apiCache := cc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 42 | nodesRespBytes, found := apiCache.Get("nodes") 43 | if !found { 44 | slog.Error("failed to get nodes response for cpu metrics from cache") 45 | return 46 | } 47 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 48 | if err != nil { 49 | slog.Error("failed to process nodes response for gpu metrics", "error", err) 50 | return 51 | } 52 | gm, err := ParseGPUsMetrics(nodesData) 53 | if err != nil { 54 | slog.Error("failed to collect gpus metrics", "error", err) 55 | return 56 | } 57 | ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, gm.alloc) 58 | ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, gm.idle) 59 | ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, gm.other) 60 | ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, gm.total) 61 | ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, gm.utilization) 62 | } 63 | 64 | type gpusMetrics struct { 65 | alloc float64 66 | idle float64 67 | other float64 68 | total float64 69 | utilization float64 70 | } 71 | 72 | func NewGPUsMetrics() *gpusMetrics { 73 | return &gpusMetrics{} 74 | } 75 | 76 | // NOTES: 77 | // node[gres] => gpu:0 # no gpus 78 | // node[gres] => gpu:nvidia_h100_80gb_hbm3:4(S:0-1) # 4 h100 gpus 79 | // node[gres_used] => gpu:nvidia_h100_80gb_hbm3:4(IDX:0-3) # 4 used gpus 80 | // node[gres_used] => gpu:nvidia_h100_80gb_hbm3:0(IDX:N/A) # 0 used gpus 81 | // node[tres] => cpu=48,mem=1020522M,billing=48,gres/gpu=4 # 4 total gpus 82 | // node[tres] => cpu=1,mem=1M,billing=1 # 0 total gpus 83 | // node[tres_used] => cpu=48,mem=1020522M,billing=48,gres/gpu=4 # 4 used 
gpus 84 | // node[tres_used] => cpu=1,mem=1M,billing=1 # 0 used gpus 85 | // 86 | // For tracking gpu resources, it looks like tres will be better. If I need to pull out per-gpu stats later, 87 | // I'll have to use gres 88 | // 89 | 90 | // ParseGPUsMetrics iterates through node response objects and tallies up the total and 91 | // allocated gpus, then derives idle and utilization from those numbers. 92 | func ParseGPUsMetrics(nodesData *api.NodesData) (*gpusMetrics, error) { 93 | gm := NewGPUsMetrics() 94 | for _, n := range nodesData.Nodes { 95 | idleGPUs := n.GPUTotal - n.GPUAllocated 96 | gm.total += float64(n.GPUTotal) 97 | gm.alloc += float64(n.GPUAllocated) 98 | gm.idle += float64(idleGPUs) 99 | } 100 | // TODO: Do we really need an "other" field? 101 | // using TRES, it should be straightforward. 102 | if gm.total > 0 { 103 | // if total is 0, we get NaN, so we check here 104 | gm.other = gm.total - (gm.alloc + gm.idle) 105 | } else { 106 | gm.other = 0 107 | } 108 | return gm, nil 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Slurm Exporter 2 | 3 | Prometheus collector and exporter for metrics extracted from the [Slurm](https://slurm.schedmd.com/overview.html) resource scheduling system. 4 | 5 | This project was forked from [https://github.com/vpenso/prometheus-slurm-exporter](https://github.com/vpenso/prometheus-slurm-exporter) and, for now, aims to be backwards-compatible from SLURM 23.11 forward. 6 | This means the existing Grafana Dashboard should plug directly into this exporter and work roughly the same. 7 | 8 | Unlike previous slurm exporters, this project leverages the SLURM REST API (`slurmrestd`) for data retrieval. 9 | Due to that difference, you are no longer required to run this exporter on a cluster node, as the exporter does not depend on having SLURM installed or being connected to the head node! 10 | I will be releasing containerized versions of this exporter soon. 11 | 12 | ## Installation 13 | 14 | This repository contains precompiled binaries for the three most recent major versions of SLURM _(Note: currently only two versions, but will be three when 24.11 releases)_. 15 | On the [releases](https://github.com/lcrownover/prometheus-slurm-exporter/releases) page, download the newest version of the exporter that matches your SLURM version. 16 | The included systemd file assumes you've saved this binary to `/usr/local/sbin/prometheus-slurm-exporter`, so drop it there or take note to change the systemd file if you choose to use it. 17 | 18 | ## Configuration 19 | 20 | The exporter requires several environment variables to be set: 21 | 22 | * `SLURM_EXPORTER_LISTEN_ADDRESS` 23 | 24 | This should be the full address for the exporter to listen on. 25 | 26 | _Default: `0.0.0.0:8080`_ 27 | 28 | * `SLURM_EXPORTER_API_URL` 29 | 30 | This is the URL to your slurmrestd server. 31 | 32 | _Example: `http://head1.domain.edu:6820`_ 33 | _Example: `unix://path/to/unix/socket`_ 34 | 35 | * `SLURM_EXPORTER_API_USER` 36 | 37 | The user specified in the token command. 38 | 39 | * `SLURM_EXPORTER_API_TOKEN` 40 | 41 | This is the [SLURM token to authenticate against slurmrestd](https://slurm.schedmd.com/jwt.html).
42 | 43 | The easiest way to generate this is by running the following line on your head node: 44 | 45 | ```bash 46 | scontrol token username=myuser lifespan=someseconds 47 | ``` 48 | 49 | `myuser` should probably be the `slurm` user, or some other privileged account. 50 | 51 | `lifespan` is specified in seconds. I set mine for 1 year (`lifespan=31536000`). 52 | 53 | * `SLURM_EXPORTER_ENABLE_TLS` 54 | 55 | Set to `true` to enable TLS support. You must also provide paths to your certificate and key. 56 | 57 | * `SLURM_EXPORTER_TLS_CERT_PATH` 58 | 59 | Path to your TLS certificate. 60 | 61 | * `SLURM_EXPORTER_TLS_KEY_PATH` 62 | 63 | Path to your TLS key, it should be `0600`. 64 | 65 | ## Systemd 66 | 67 | A systemd unit file is [included](https://github.com/lcrownover/prometheus-slurm-exporter/blob/develop/extras/systemd/prometheus-slurm-exporter.service) for ease of deployment. 68 | 69 | This unit file assumes you've written your environment variables to `/etc/prometheus-slurm-exporter/env.conf` in the format: 70 | 71 | ``` 72 | SLURM_EXPORTER_API_URL="http://head.domain.edu:6820" 73 | SLURM_EXPORTER_API_USER="root" 74 | SLURM_EXPORTER_API_TOKEN="mytoken" 75 | ``` 76 | 77 | _Don't forget to `chmod 600 /etc/prometheus-slurm-exporter/env.conf`!_ 78 | 79 | ## Prometheus Server Scrape Config 80 | 81 | This is an example scrape config for your prometheus server: 82 | 83 | ``` 84 | scrape_configs: 85 | - job_name: 'slurm_exporter' 86 | scrape_interval: 30s 87 | scrape_timeout: 30s 88 | static_configs: 89 | - targets: ['exporter_host.domain.edu:8080'] 90 | ``` 91 | 92 | ## Grafana Dashboard 93 | 94 | The [dashboard](https://grafana.com/dashboards/4323) published by the previous author should work the same with this exporter. 95 | I will be releasing a new version of the dashboard soon that will receive new features. 96 | 97 | ![Status of the Nodes](images/Node_Status.png) 98 | 99 | ![Status of the Jobs](images/Job_Status.png) 100 | 101 | ![SLURM Scheduler Information](images/Scheduler_Info.png) 102 | 103 | ## Contributing 104 | 105 | Check out the [CONTRIBUTING.md](CONTRIBUTING.md) document. 
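## Troubleshooting

If the exporter starts but `/metrics` returns errors or no SLURM data, the `slurmrestd` connection details are the usual suspect. As a rough sanity check, you can query `slurmrestd` directly with the same values the exporter uses. The `X-SLURM-USER-NAME` and `X-SLURM-USER-TOKEN` headers below are the ones the exporter itself sends; the version segment in the path (for example `v0.0.40` or `v0.0.41`) depends on which API versions your `slurmrestd` build exposes, so adjust it to match:

```bash
# Hit the diag endpoint with the same credentials the exporter will use.
curl -s \
  -H "X-SLURM-USER-NAME: $SLURM_EXPORTER_API_USER" \
  -H "X-SLURM-USER-TOKEN: $SLURM_EXPORTER_API_TOKEN" \
  "$SLURM_EXPORTER_API_URL/slurm/v0.0.41/diag"
```

If that returns JSON rather than an authentication error, the exporter should be able to reach `slurmrestd` with the same settings.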
106 | -------------------------------------------------------------------------------- /internal/slurm/cpus.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | // CPU metrics collector 14 | type CPUsCollector struct { 15 | ctx context.Context 16 | alloc *prometheus.Desc 17 | idle *prometheus.Desc 18 | other *prometheus.Desc 19 | total *prometheus.Desc 20 | } 21 | 22 | // NewCPUsCollector creates a new CPUsCollector 23 | func NewCPUsCollector(ctx context.Context) *CPUsCollector { 24 | return &CPUsCollector{ 25 | ctx: ctx, 26 | alloc: prometheus.NewDesc("slurm_cpus_alloc", "Allocated CPUs", nil, nil), 27 | idle: prometheus.NewDesc("slurm_cpus_idle", "Idle CPUs", nil, nil), 28 | other: prometheus.NewDesc("slurm_cpus_other", "Mix CPUs", nil, nil), 29 | total: prometheus.NewDesc("slurm_cpus_total", "Total CPUs", nil, nil), 30 | } 31 | } 32 | 33 | func (cc *CPUsCollector) Describe(ch chan<- *prometheus.Desc) { 34 | ch <- cc.alloc 35 | ch <- cc.idle 36 | ch <- cc.other 37 | ch <- cc.total 38 | } 39 | 40 | func (cc *CPUsCollector) Collect(ch chan<- prometheus.Metric) { 41 | apiCache := cc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 42 | jobsRespBytes, found := apiCache.Get("jobs") 43 | if !found { 44 | slog.Error("failed to get jobs response for users metrics from cache") 45 | return 46 | } 47 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 48 | if err != nil { 49 | slog.Error("failed to process jobs response for cpu metrics", "error", err) 50 | return 51 | } 52 | nodesRespBytes, found := apiCache.Get("nodes") 53 | if !found { 54 | slog.Error("failed to get nodes response for cpu metrics from cache") 55 | return 56 | } 57 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to process nodes response for cpu metrics", "error", err) 60 | return 61 | } 62 | cm, err := ParseCPUsMetrics(nodesData, jobsData) 63 | if err != nil { 64 | slog.Error("failed to collect cpus metrics", "error", err) 65 | return 66 | } 67 | ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc) 68 | ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle) 69 | ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other) 70 | ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total) 71 | } 72 | 73 | type cpusMetrics struct { 74 | alloc float64 75 | idle float64 76 | other float64 77 | total float64 78 | } 79 | 80 | func NewCPUsMetrics() *cpusMetrics { 81 | return &cpusMetrics{} 82 | } 83 | 84 | // ParseCPUMetrics pulls out total cluster cpu states of alloc,idle,other,total 85 | func ParseCPUsMetrics(nodesData *api.NodesData, jobsData *api.JobsData) (*cpusMetrics, error) { 86 | cm := NewCPUsMetrics() 87 | for _, j := range jobsData.Jobs { 88 | // alloc is easy, we just add up all the cpus in the "Running" job state 89 | if j.JobState == types.JobStateRunning { 90 | cm.alloc += float64(j.Cpus) 91 | } 92 | } 93 | // total is just the total number of cpus in the cluster 94 | nodes := nodesData.Nodes 95 | for _, n := range nodes { 96 | if n.Cpus == 1 { 97 | // TODO: This probably needs to be a call to partitions to get all nodes 98 | // in a 
partition, then add the nodes CPU values up for this field. 99 | // In our environment, nodes that exist (need slurm commands) get 100 | // put into slurm without being assigned a partition, but slurm 101 | // seems to track these systems with cpus=1. 102 | // This isn't a problem unless your site has nodes with a single CPU. 103 | continue 104 | } 105 | cpus := float64(n.Cpus) 106 | cm.total += cpus 107 | 108 | for _, ns := range n.States { 109 | if ns == types.NodeStateMix || ns == types.NodeStateAlloc || ns == types.NodeStateIdle { 110 | // TODO: This calculate is scuffed. In our 17k core environment, it's 111 | // reporting ~400 more than the `sinfo -h -o '%C'` command. 112 | // Gotta figure this one out. 113 | idle_cpus := float64(n.AllocIdleCpus) 114 | cm.idle += idle_cpus 115 | } 116 | } 117 | } 118 | // Assumedly, this should be fine. 119 | cm.other = cm.total - cm.idle - cm.alloc 120 | return cm, nil 121 | } 122 | -------------------------------------------------------------------------------- /internal/slurm/account.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | /* 14 | 15 | AccountsCollector collects metrics for accounts 16 | 17 | */ 18 | 19 | // AccountsCollector collects metrics for accounts 20 | type AccountsCollector struct { 21 | ctx context.Context 22 | pending *prometheus.Desc 23 | pending_cpus *prometheus.Desc 24 | running *prometheus.Desc 25 | running_cpus *prometheus.Desc 26 | suspended *prometheus.Desc 27 | } 28 | 29 | // NewAccountsCollector creates a new AccountsCollector 30 | func NewAccountsCollector(ctx context.Context) *AccountsCollector { 31 | labels := []string{"account"} 32 | return &AccountsCollector{ 33 | ctx: ctx, 34 | pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil), 35 | pending_cpus: prometheus.NewDesc("slurm_account_cpus_pending", "Pending cpus for account", labels, nil), 36 | running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil), 37 | running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil), 38 | suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil), 39 | } 40 | } 41 | 42 | func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) { 43 | ch <- ac.pending 44 | ch <- ac.pending_cpus 45 | ch <- ac.running 46 | ch <- ac.running_cpus 47 | ch <- ac.suspended 48 | } 49 | 50 | func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) { 51 | apiCache := ac.ctx.Value(types.ApiCacheKey).(*cache.Cache) 52 | jobsRespBytes, found := apiCache.Get("jobs") 53 | if !found { 54 | slog.Error("failed to get jobs response for users metrics from cache") 55 | return 56 | } 57 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to extract jobs data for accounts metrics", "error", err) 60 | return 61 | } 62 | am, err := ParseAccountsMetrics(*jobsData) 63 | if err != nil { 64 | slog.Error("failed to parse accounts metrics", "error", err) 65 | return 66 | } 67 | for a := range am { 68 | if am[a].pending > 0 { 69 | ch <- prometheus.MustNewConstMetric(ac.pending, 
prometheus.GaugeValue, am[a].pending, a) 70 | } 71 | if am[a].pending_cpus > 0 { 72 | ch <- prometheus.MustNewConstMetric(ac.pending_cpus, prometheus.GaugeValue, am[a].pending_cpus, a) 73 | } 74 | if am[a].running > 0 { 75 | ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a) 76 | } 77 | if am[a].running_cpus > 0 { 78 | ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a) 79 | } 80 | if am[a].suspended > 0 { 81 | ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a) 82 | } 83 | } 84 | } 85 | 86 | type JobMetrics struct { 87 | pending float64 88 | pending_cpus float64 89 | running float64 90 | running_cpus float64 91 | suspended float64 92 | } 93 | 94 | func NewJobMetrics() *JobMetrics { 95 | return &JobMetrics{} 96 | } 97 | 98 | // ParseAccountsMetrics gets the response body of jobs from SLURM and 99 | // parses it into a map of "accountName": *JobMetrics 100 | func ParseAccountsMetrics(jobsData api.JobsData) (map[string]*JobMetrics, error) { 101 | accounts := make(map[string]*JobMetrics) 102 | for _, j := range jobsData.Jobs { 103 | // build the map with the account name as the key and job metrics as the value 104 | _, key := accounts[j.Account] 105 | if !key { 106 | // initialize a new metrics object if the key isnt found 107 | accounts[j.Account] = NewJobMetrics() 108 | } 109 | // for each of the jobs, depending on the state, 110 | // tally up the cpu count and increment the count of jobs for that state 111 | switch j.JobState { 112 | case types.JobStatePending: 113 | accounts[j.Account].pending++ 114 | accounts[j.Account].pending_cpus += float64(j.Cpus) 115 | case types.JobStateRunning: 116 | accounts[j.Account].running++ 117 | accounts[j.Account].running_cpus += float64(j.Cpus) 118 | case types.JobStateSuspended: 119 | accounts[j.Account].suspended++ 120 | } 121 | } 122 | return accounts, nil 123 | } 124 | -------------------------------------------------------------------------------- /internal/slurm/node.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | 8 | "github.com/akyoto/cache" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 10 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 11 | "github.com/prometheus/client_golang/prometheus" 12 | ) 13 | 14 | type NodeCollector struct { 15 | ctx context.Context 16 | cpuAlloc *prometheus.Desc 17 | cpuIdle *prometheus.Desc 18 | cpuOther *prometheus.Desc 19 | cpuTotal *prometheus.Desc 20 | memAlloc *prometheus.Desc 21 | memTotal *prometheus.Desc 22 | } 23 | 24 | // NewNodeCollectorOld creates a Prometheus collector to keep all our stats in 25 | // It returns a set of collections for consumption 26 | func NewNodeCollector(ctx context.Context) *NodeCollector { 27 | labels := []string{"node", "status"} 28 | 29 | return &NodeCollector{ 30 | ctx: ctx, 31 | cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), 32 | cpuIdle: prometheus.NewDesc("slurm_node_cpu_idle", "Idle CPUs per node", labels, nil), 33 | cpuOther: prometheus.NewDesc("slurm_node_cpu_other", "Other CPUs per node", labels, nil), 34 | cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil), 35 | memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil), 36 | memTotal: prometheus.NewDesc("slurm_node_mem_total", 
"Total memory per node", labels, nil), 37 | } 38 | } 39 | 40 | // Send all metric descriptions 41 | func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { 42 | ch <- nc.cpuAlloc 43 | ch <- nc.cpuIdle 44 | ch <- nc.cpuOther 45 | ch <- nc.cpuTotal 46 | ch <- nc.memAlloc 47 | ch <- nc.memTotal 48 | } 49 | 50 | func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { 51 | apiCache := nc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 52 | nodesRespBytes, found := apiCache.Get("nodes") 53 | if !found { 54 | slog.Error("failed to get nodes response for cpu metrics from cache") 55 | return 56 | } 57 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to process nodes response for node metrics", "error", err) 60 | return 61 | } 62 | nm, err := ParseNodeMetrics(nodesData) 63 | if err != nil { 64 | slog.Error("failed to collect nodes metrics", "error", err) 65 | return 66 | } 67 | for node := range nm { 68 | ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nm[node].cpuAlloc), node, nm[node].nodeStatus) 69 | ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nm[node].cpuIdle), node, nm[node].nodeStatus) 70 | ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nm[node].cpuOther), node, nm[node].nodeStatus) 71 | ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nm[node].cpuTotal), node, nm[node].nodeStatus) 72 | ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nm[node].memAlloc), node, nm[node].nodeStatus) 73 | ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nm[node].memTotal), node, nm[node].nodeStatus) 74 | } 75 | } 76 | 77 | // NodeMetrics stores metrics for each node 78 | type nodeMetrics struct { 79 | memAlloc uint64 80 | memTotal uint64 81 | cpuAlloc uint64 82 | cpuIdle uint64 83 | cpuOther uint64 84 | cpuTotal uint64 85 | nodeStatus string 86 | } 87 | 88 | func NewNodeMetrics() *nodeMetrics { 89 | return &nodeMetrics{} 90 | } 91 | 92 | // ParseNodeMetrics takes the output of sinfo with node data 93 | // It returns a map of metrics per node 94 | func ParseNodeMetrics(nodesData *api.NodesData) (map[string]*nodeMetrics, error) { 95 | nodeMap := make(map[string]*nodeMetrics) 96 | 97 | for _, n := range nodesData.Nodes { 98 | nodeName := n.Hostname 99 | nodeMap[nodeName] = &nodeMetrics{0, 0, 0, 0, 0, 0, ""} 100 | 101 | // state 102 | nodeStatesStr, err := n.GetNodeStatesString("|") 103 | if err != nil { 104 | return nil, fmt.Errorf("failed to get node state: %v", err) 105 | } 106 | nodeMap[nodeName].nodeStatus = nodeStatesStr 107 | 108 | // memory 109 | nodeMap[nodeName].memAlloc = uint64(n.AllocMemory) 110 | nodeMap[nodeName].memTotal = uint64(n.RealMemory) 111 | 112 | // cpu 113 | nodeMap[nodeName].cpuAlloc = uint64(n.AllocCpus) 114 | nodeMap[nodeName].cpuIdle = uint64(n.AllocIdleCpus) 115 | nodeMap[nodeName].cpuOther = uint64(n.OtherCpus) 116 | nodeMap[nodeName].cpuTotal = uint64(n.Cpus) 117 | } 118 | 119 | return nodeMap, nil 120 | } 121 | -------------------------------------------------------------------------------- /internal/slurm/nodes.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | 
"github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type NodesCollector struct { 14 | ctx context.Context 15 | alloc *prometheus.Desc 16 | comp *prometheus.Desc 17 | down *prometheus.Desc 18 | drain *prometheus.Desc 19 | err *prometheus.Desc 20 | fail *prometheus.Desc 21 | idle *prometheus.Desc 22 | maint *prometheus.Desc 23 | mix *prometheus.Desc 24 | resv *prometheus.Desc 25 | } 26 | 27 | func NewNodesCollector(ctx context.Context) *NodesCollector { 28 | return &NodesCollector{ 29 | ctx: ctx, 30 | alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", nil, nil), 31 | comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", nil, nil), 32 | down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", nil, nil), 33 | drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", nil, nil), 34 | err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", nil, nil), 35 | fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", nil, nil), 36 | idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", nil, nil), 37 | maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", nil, nil), 38 | mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", nil, nil), 39 | resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", nil, nil), 40 | } 41 | } 42 | 43 | func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) { 44 | ch <- nc.alloc 45 | ch <- nc.comp 46 | ch <- nc.down 47 | ch <- nc.drain 48 | ch <- nc.err 49 | ch <- nc.fail 50 | ch <- nc.idle 51 | ch <- nc.maint 52 | ch <- nc.mix 53 | ch <- nc.resv 54 | } 55 | 56 | func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) { 57 | apiCache := nc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 58 | nodesRespBytes, found := apiCache.Get("nodes") 59 | if !found { 60 | slog.Error("failed to get nodes response for cpu metrics from cache") 61 | return 62 | } 63 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 64 | if err != nil { 65 | slog.Error("failed to process nodes response for nodes metrics", "error", err) 66 | return 67 | } 68 | nm, err := ParseNodesMetrics(nodesData) 69 | if err != nil { 70 | slog.Error("failed to collect nodes metrics", "error", err) 71 | return 72 | } 73 | ch <- prometheus.MustNewConstMetric(nc.alloc, prometheus.GaugeValue, nm.alloc) 74 | ch <- prometheus.MustNewConstMetric(nc.comp, prometheus.GaugeValue, nm.comp) 75 | ch <- prometheus.MustNewConstMetric(nc.down, prometheus.GaugeValue, nm.down) 76 | ch <- prometheus.MustNewConstMetric(nc.drain, prometheus.GaugeValue, nm.drain) 77 | ch <- prometheus.MustNewConstMetric(nc.err, prometheus.GaugeValue, nm.err) 78 | ch <- prometheus.MustNewConstMetric(nc.fail, prometheus.GaugeValue, nm.fail) 79 | ch <- prometheus.MustNewConstMetric(nc.idle, prometheus.GaugeValue, nm.idle) 80 | ch <- prometheus.MustNewConstMetric(nc.maint, prometheus.GaugeValue, nm.maint) 81 | ch <- prometheus.MustNewConstMetric(nc.mix, prometheus.GaugeValue, nm.mix) 82 | ch <- prometheus.MustNewConstMetric(nc.resv, prometheus.GaugeValue, nm.resv) 83 | } 84 | 85 | type nodesMetrics struct { 86 | alloc float64 87 | comp float64 88 | down float64 89 | drain float64 90 | err float64 91 | fail float64 92 | idle float64 93 | maint float64 94 | mix float64 95 | resv float64 96 | } 97 | 98 | func NewNodesMetrics() *nodesMetrics { 99 | return &nodesMetrics{} 100 | } 101 | 102 | // ParseNodesMetrics iterates through node response objects and tallies up 103 | // nodes based on their state 104 
| func ParseNodesMetrics(nodesData *api.NodesData) (*nodesMetrics, error) { 105 | nm := NewNodesMetrics() 106 | 107 | for _, n := range nodesData.Nodes { 108 | for _, ns := range n.States { 109 | switch ns { 110 | case types.NodeStateAlloc: 111 | nm.alloc += 1 112 | case types.NodeStateComp: 113 | nm.comp += 1 114 | case types.NodeStateDown: 115 | nm.down += 1 116 | case types.NodeStateDrain: 117 | nm.drain += 1 118 | case types.NodeStateErr: 119 | nm.err += 1 120 | case types.NodeStateFail: 121 | nm.fail += 1 122 | case types.NodeStateIdle: 123 | nm.idle += 1 124 | case types.NodeStateMaint: 125 | nm.maint += 1 126 | case types.NodeStateMix: 127 | nm.mix += 1 128 | case types.NodeStateResv: 129 | nm.resv += 1 130 | } 131 | } 132 | } 133 | 134 | return nm, nil 135 | } 136 | -------------------------------------------------------------------------------- /cmd/prometheus-slurm-exporter/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "log/slog" 8 | "net/http" 9 | "os" 10 | "strconv" 11 | "strings" 12 | "time" 13 | 14 | "github.com/akyoto/cache" 15 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 16 | "github.com/lcrownover/prometheus-slurm-exporter/internal/slurm" 17 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 18 | "github.com/prometheus/client_golang/prometheus" 19 | ) 20 | 21 | var err error 22 | 23 | var version = "2.1.1-beta" 24 | 25 | func main() { 26 | // set up logging 27 | lvl := slog.LevelInfo 28 | _, debug := os.LookupEnv("SLURM_EXPORTER_DEBUG") 29 | if debug { 30 | lvl = slog.LevelDebug 31 | } 32 | l := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ 33 | Level: lvl, 34 | })) 35 | slog.SetDefault(l) 36 | slog.Debug("debug logging enabled") 37 | 38 | // if -v is passed, print the version and exit 39 | if len(os.Args) > 1 && os.Args[1] == "-v" { 40 | fmt.Println(version) 41 | os.Exit(0) 42 | } 43 | 44 | log.Printf("Starting Prometheus Slurm Exporter %s\n", version) 45 | 46 | listenAddress, found := os.LookupEnv("SLURM_EXPORTER_LISTEN_ADDRESS") 47 | if !found { 48 | listenAddress = "0.0.0.0:8080" 49 | } 50 | 51 | apiURL, found := os.LookupEnv("SLURM_EXPORTER_API_URL") 52 | if !found { 53 | fmt.Println("You must set SLURM_EXPORTER_API_URL. Example: localhost:6820") 54 | os.Exit(1) 55 | } 56 | 57 | var apiUser string 58 | var apiToken string 59 | var tlsEnable bool 60 | var tlsCert string 61 | var tlsKey string 62 | 63 | // we only need these values if the endpoint is not unix:// 64 | if strings.HasPrefix(apiURL, "http://") || strings.HasPrefix(apiURL, "https://") { 65 | var found bool 66 | apiUser, found = os.LookupEnv("SLURM_EXPORTER_API_USER") 67 | if !found { 68 | fmt.Println("You must set SLURM_EXPORTER_API_USER") 69 | os.Exit(1) 70 | } 71 | 72 | apiToken, found = os.LookupEnv("SLURM_EXPORTER_API_TOKEN") 73 | if !found { 74 | fmt.Println("You must set SLURM_EXPORTER_API_TOKEN") 75 | os.Exit(1) 76 | } 77 | 78 | tlsString, found := os.LookupEnv("SLURM_EXPORTER_ENABLE_TLS") 79 | 80 | if !found { 81 | tlsEnable = false // default to false, do not break existing conf files 82 | } else { 83 | tlsEnable, err = strconv.ParseBool(tlsString) 84 | if err != nil { 85 | fmt.Println("Failed to parse SLURM_EXPORTER_ENABLE_TLS. 
Please set to 1, t, T, TRUE, true, True, 0, f, F, FALSE, false, or False.") 86 | } 87 | } 88 | if tlsEnable { // require tlsCert and tlsKey only if tlsEnable is true 89 | tlsCert, found = os.LookupEnv("SLURM_EXPORTER_TLS_CERT_PATH") 90 | if !found { 91 | fmt.Println("You must set SLURM_EXPORTER_TLS_CERT_PATH to the path of your cert") 92 | os.Exit(1) 93 | } 94 | tlsKey, found = os.LookupEnv("SLURM_EXPORTER_TLS_KEY_PATH") 95 | if !found { 96 | fmt.Println("You must set SLURM_EXPORTER_TLS_KEY_PATH to the path of your key") 97 | os.Exit(1) 98 | } 99 | } 100 | 101 | } else if strings.HasPrefix(apiURL, "unix://") { 102 | apiUser = "" 103 | apiToken = "" 104 | tlsEnable = false 105 | tlsCert = "" 106 | tlsKey = "" 107 | 108 | } else { 109 | fmt.Println("SLURM_EXPORTER_API_URL must start with unix://, http://, or https://") 110 | fmt.Println("Got: ", apiURL) 111 | os.Exit(1) 112 | } 113 | // API Cache 114 | apiCache := cache.New(60 * time.Second) 115 | 116 | // Set up the context to pass around 117 | ctx := context.Background() 118 | ctx = context.WithValue(ctx, types.ApiUserKey, apiUser) 119 | ctx = context.WithValue(ctx, types.ApiTokenKey, apiToken) 120 | ctx = context.WithValue(ctx, types.ApiURLKey, apiURL) 121 | ctx = context.WithValue(ctx, types.ApiCacheKey, apiCache) 122 | 123 | // Register all the endpoints 124 | ctx = api.RegisterEndpoints(ctx) 125 | 126 | // Register all the collectors 127 | r := prometheus.NewRegistry() 128 | r.MustRegister(slurm.NewAccountsCollector(ctx)) 129 | r.MustRegister(slurm.NewCPUsCollector(ctx)) 130 | r.MustRegister(slurm.NewGPUsCollector(ctx)) 131 | r.MustRegister(slurm.NewNodesCollector(ctx)) 132 | r.MustRegister(slurm.NewNodeCollector(ctx)) 133 | r.MustRegister(slurm.NewPartitionsCollector(ctx)) 134 | r.MustRegister(slurm.NewFairShareCollector(ctx)) 135 | r.MustRegister(slurm.NewQueueCollector(ctx)) 136 | r.MustRegister(slurm.NewSchedulerCollector(ctx)) 137 | r.MustRegister(slurm.NewUsersCollector(ctx)) 138 | 139 | log.Printf("Starting Server: %s\n", listenAddress) 140 | http.Handle("/metrics", api.MetricsHandler(r, ctx)) 141 | if tlsEnable { 142 | log.Fatal(http.ListenAndServeTLS(listenAddress, tlsCert, tlsKey, nil)) 143 | } else { 144 | log.Fatal(http.ListenAndServe(listenAddress, nil)) 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /testdata/SlurmV0041GetShares200Response.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : 2.3021358869347655, 18 | "shares" : { 19 | "number" : 1, 20 | "set" : true, 21 | "infinite" : true 22 | }, 23 | "partition" : "partition", 24 | "usage_normalized" : { 25 | "number" : 7.061401241503109, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "name" : "name", 30 | "tres" : { 31 | "run_seconds" : [ { 32 | "name" : "name", 33 | "value" : { 34 | "number" : 5, 35 | "set" : true, 36 | "infinite" : true 37 | } 38 | }, { 39 | "name" : "name", 40 | "value" : { 41 | "number" : 5, 42 | "set" : true, 43 | "infinite" : true 44 | } 45 | } ], 46 | "usage" : [ { 47 | "name" : "name", 48 | "value" : 5.637376656633329 49 | }, { 50 | "name" : "name", 51 | "value" : 
5.637376656633329 52 | } ], 53 | "group_minutes" : [ { 54 | "name" : "name", 55 | "value" : { 56 | "number" : 5, 57 | "set" : true, 58 | "infinite" : true 59 | } 60 | }, { 61 | "name" : "name", 62 | "value" : { 63 | "number" : 5, 64 | "set" : true, 65 | "infinite" : true 66 | } 67 | } ] 68 | }, 69 | "id" : 0 70 | }, { 71 | "cluster" : "cluster", 72 | "parent" : "parent", 73 | "shares_normalized" : { 74 | "number" : 6.027456183070403, 75 | "set" : true, 76 | "infinite" : true 77 | }, 78 | "usage" : 9, 79 | "fairshare" : { 80 | "level" : 2.027123023002322, 81 | "factor" : 3.616076749251911 82 | }, 83 | "type" : [ "USER", "USER" ], 84 | "effective_usage" : 2.3021358869347655, 85 | "shares" : { 86 | "number" : 1, 87 | "set" : true, 88 | "infinite" : true 89 | }, 90 | "partition" : "partition", 91 | "usage_normalized" : { 92 | "number" : 7.061401241503109, 93 | "set" : true, 94 | "infinite" : true 95 | }, 96 | "name" : "name", 97 | "tres" : { 98 | "run_seconds" : [ { 99 | "name" : "name", 100 | "value" : { 101 | "number" : 5, 102 | "set" : true, 103 | "infinite" : true 104 | } 105 | }, { 106 | "name" : "name", 107 | "value" : { 108 | "number" : 5, 109 | "set" : true, 110 | "infinite" : true 111 | } 112 | } ], 113 | "usage" : [ { 114 | "name" : "name", 115 | "value" : 5.637376656633329 116 | }, { 117 | "name" : "name", 118 | "value" : 5.637376656633329 119 | } ], 120 | "group_minutes" : [ { 121 | "name" : "name", 122 | "value" : { 123 | "number" : 5, 124 | "set" : true, 125 | "infinite" : true 126 | } 127 | }, { 128 | "name" : "name", 129 | "value" : { 130 | "number" : 5, 131 | "set" : true, 132 | "infinite" : true 133 | } 134 | } ] 135 | }, 136 | "id" : 0 137 | } ], 138 | "total_shares" : 4 139 | }, 140 | "meta" : { 141 | "slurm" : { 142 | "cluster" : "cluster", 143 | "release" : "release", 144 | "version" : { 145 | "major" : "major", 146 | "minor" : "minor", 147 | "micro" : "micro" 148 | } 149 | }, 150 | "plugin" : { 151 | "accounting_storage" : "accounting_storage", 152 | "name" : "name", 153 | "type" : "type", 154 | "data_parser" : "data_parser" 155 | }, 156 | "client" : { 157 | "source" : "source", 158 | "user" : "user", 159 | "group" : "group" 160 | }, 161 | "command" : [ "command", "command" ] 162 | }, 163 | "warnings" : [ { 164 | "description" : "description", 165 | "source" : "source" 166 | }, { 167 | "description" : "description", 168 | "source" : "source" 169 | } ], 170 | "errors" : [ { 171 | "description" : "description", 172 | "source" : "source", 173 | "error" : "error", 174 | "error_number" : 5 175 | }, { 176 | "description" : "description", 177 | "source" : "source", 178 | "error" : "error", 179 | "error_number" : 5 180 | } ] 181 | } 182 | -------------------------------------------------------------------------------- /docker/24.05.dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | RUN dnf clean all && \ 3 | dnf update -y && \ 4 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 5 | dnf install -y --enablerepo=devel mariadb-devel python3-PyMySQL hwloc lz4-devel wget bzip2 perl munge-devel munge cmake jansson libjwt-devel libjwt json-c-devel json-c http-parser-devel http-parser libcgroup libcgroup-tools dbus-devel mariadb && \ 6 | dnf group install -y "Development Tools" 7 | 8 | RUN dnf install -y sudo 9 | 10 | RUN dnf -y update && \ 11 | dnf install -y systemd && \ 12 | dnf clean all && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | 16 | # Add fake users to run jobs as 17 
| RUN adduser user1 18 | RUN adduser user2 19 | RUN adduser slurm 20 | 21 | # Install http_parser 22 | RUN git clone --depth 1 --single-branch -b v2.9.4 https://github.com/nodejs/http-parser.git http_parser \ 23 | && cd http_parser \ 24 | && make \ 25 | && make install 26 | 27 | #RUN dnf install -y systemd 28 | #slurmrestd -d list 29 | #Need to do this 30 | 31 | RUN dnf install -y jansson-devel 32 | 33 | RUN git clone --depth 1 --single-branch -b v1.12.0 https://github.com/benmcollins/libjwt.git libjwt \ 34 | && cd libjwt \ 35 | && autoreconf --force --install \ 36 | && ./configure --prefix=/usr/local/ \ 37 | && make -j && make install 38 | 39 | 40 | WORKDIR /slurm 41 | RUN wget https://download.schedmd.com/slurm/slurm-24.05-latest.tar.bz2 && tar -xvjf slurm-24.05-latest.tar.bz2 --strip-components=1 42 | # add --with-jwt=/usr/local/ 43 | RUN ./configure \ 44 | --with-cgroup-v2 \ 45 | --with-http-parser=/usr/local/ \ 46 | --enable-slurmrestd \ 47 | --with-jwt=/usr/local/ \ 48 | && make && make install 49 | 50 | # Create the /var/log/slurm directory and set permissions 51 | RUN mkdir -p /var/log/slurm && \ 52 | chown slurm:slurm /var/log/slurm && \ 53 | chmod 750 /var/log/slurm && \ 54 | touch /var/log/slurm/slurmd.log /var/log/slurm/slurmctld.log /var/log/slurm/slurmdbd.log && \ 55 | chown slurm:slurm /var/log/slurm/slurmctld.log /var/log/slurm/slurmd.log /var/log/slurm/slurmdbd.log 56 | 57 | RUN getent group munge || groupadd -r munge && \ 58 | getent passwd munge || useradd -r -g munge munge && \ 59 | mkdir -p /var/log/munge && \ 60 | chown munge:munge /var/log/munge && \ 61 | chmod 750 /var/log/munge && \ 62 | /usr/sbin/create-munge-key && \ 63 | chown munge:munge /etc/munge/munge.key && \ 64 | chmod 400 /etc/munge/munge.key 65 | 66 | RUN touch /var/log/munge/munged.log && \ 67 | chown munge:munge /var/log/munge/munged.log 68 | 69 | COPY slurm.conf /usr/local/etc/slurm.conf 70 | 71 | USER root 72 | COPY cgroup.conf /usr/local/etc/cgroup.conf 73 | COPY slurm.conf /usr/local/etc/slurm.conf 74 | COPY slurmdbd.conf /usr/local/etc/slurmdbd.conf 75 | RUN chown slurm:slurm /usr/local/etc/slurmdbd.conf 76 | RUN chmod 600 /usr/local/etc/slurmdbd.conf 77 | COPY start_slurm.sh /start_slurm.sh 78 | COPY start_jobs.sh /start_jobs.sh 79 | 80 | ENV SLURM_CONF=/usr/local/etc/slurm.conf 81 | RUN chmod 755 /start_slurm.sh /start_jobs.sh 82 | 83 | RUN mkdir -p /var/spool/slurm /var/spool/slurmd && \ 84 | chown slurm:slurm /var/spool/slurm /var/spool/slurmd && \ 85 | chmod 755 /var/spool/slurmd 86 | 87 | RUN chown -R slurm:slurm /slurm/src/ 88 | 89 | # touch /var/spool/slurmd/cred_state && \ 90 | # chown slurm:slurm /var/spool/slurmd/cred_state && \ 91 | # chmod 755 /var/spool/slurmd/cred_state 92 | 93 | RUN mkdir -p /var/spool/slurm/statesave && dd if=/dev/random of=/var/spool/slurm/statesave/jwt_hs256.key bs=32 count=1 \ 94 | && chown slurm:slurm /var/spool/slurm/statesave/jwt_hs256.key \ 95 | && chmod 0600 /var/spool/slurm/statesave/jwt_hs256.key \ 96 | && chown slurm:slurm /var/spool/slurm/statesave \ 97 | && chmod 0755 /var/spool/slurm/statesave 98 | 99 | 100 | RUN mkdir -p /jobs /jobs/output /jobs/err && \ 101 | chown root:slurm /jobs /jobs/output /jobs/err 102 | 103 | # Create sample SLURM job scripts 104 | 105 | COPY hello_world_job.sbatch /jobs/hello_world_job.sbatch 106 | COPY lets_go_job.sbatch /jobs/lets_go_job.sbatch 107 | 108 | RUN chmod +x /jobs/hello_world_job.sbatch /jobs/lets_go_job.sbatch 109 | 110 | # Ask Lucas about what other ports need to be exposed or if I need to build slurm with 
this port exposed from the getgo 111 | EXPOSE 6280 112 | 113 | RUN ln -s /slurm/src/slurmd/slurmd/slurmd /bin/slurmd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 114 | RUN ln -s /slurm/src/slurmdbd/slurmdbd /bin/slurmdbd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 115 | RUN ln -s /slurm/src/slurmrestd/slurmrestd /bin/slurmrestd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 116 | 117 | RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.41 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.41.json 118 | RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.40 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.40.json 119 | 120 | ENTRYPOINT ["/start_slurm.sh"] 121 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiSharesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : { 18 | "number" : 2.3021358869347655, 19 | "set" : true, 20 | "infinite" : true 21 | }, 22 | "shares" : { 23 | "number" : 1, 24 | "set" : true, 25 | "infinite" : true 26 | }, 27 | "partition" : "partition", 28 | "usage_normalized" : { 29 | "number" : 7.061401241503109, 30 | "set" : true, 31 | "infinite" : true 32 | }, 33 | "name" : "name", 34 | "tres" : { 35 | "run_seconds" : [ { 36 | "name" : "name", 37 | "value" : { 38 | "number" : 5, 39 | "set" : true, 40 | "infinite" : true 41 | } 42 | }, { 43 | "name" : "name", 44 | "value" : { 45 | "number" : 5, 46 | "set" : true, 47 | "infinite" : true 48 | } 49 | } ], 50 | "usage" : [ { 51 | "name" : "name", 52 | "value" : 5.637376656633329 53 | }, { 54 | "name" : "name", 55 | "value" : 5.637376656633329 56 | } ], 57 | "group_minutes" : [ { 58 | "name" : "name", 59 | "value" : { 60 | "number" : 5, 61 | "set" : true, 62 | "infinite" : true 63 | } 64 | }, { 65 | "name" : "name", 66 | "value" : { 67 | "number" : 5, 68 | "set" : true, 69 | "infinite" : true 70 | } 71 | } ] 72 | }, 73 | "id" : 0 74 | }, { 75 | "cluster" : "cluster", 76 | "parent" : "parent", 77 | "shares_normalized" : { 78 | "number" : 6.027456183070403, 79 | "set" : true, 80 | "infinite" : true 81 | }, 82 | "usage" : 9, 83 | "fairshare" : { 84 | "level" : 2.027123023002322, 85 | "factor" : 3.616076749251911 86 | }, 87 | "type" : [ "USER", "USER" ], 88 | "effective_usage" : { 89 | "number" : 2.3021358869347655, 90 | "set" : true, 91 | "infinite" : true 92 | }, 93 | "shares" : { 94 | "number" : 1, 95 | "set" : true, 96 | "infinite" : true 97 | }, 98 | "partition" : "partition", 99 | "usage_normalized" : { 100 | "number" : 7.061401241503109, 101 | "set" : true, 102 | "infinite" : true 103 | }, 104 | "name" : "name", 105 | "tres" : { 106 | "run_seconds" : [ { 107 | "name" : "name", 108 | "value" : { 109 | "number" : 5, 110 | "set" : true, 111 | "infinite" : true 112 | } 113 | }, { 114 | "name" : "name", 115 | "value" : { 116 | "number" : 5, 117 | "set" : true, 118 | "infinite" : true 119 | } 120 | } ], 121 | "usage" : [ { 122 | "name" : "name", 123 | "value" : 5.637376656633329 124 | 
}, { 125 | "name" : "name", 126 | "value" : 5.637376656633329 127 | } ], 128 | "group_minutes" : [ { 129 | "name" : "name", 130 | "value" : { 131 | "number" : 5, 132 | "set" : true, 133 | "infinite" : true 134 | } 135 | }, { 136 | "name" : "name", 137 | "value" : { 138 | "number" : 5, 139 | "set" : true, 140 | "infinite" : true 141 | } 142 | } ] 143 | }, 144 | "id" : 0 145 | } ], 146 | "total_shares" : 4 147 | }, 148 | "meta" : { 149 | "slurm" : { 150 | "cluster" : "cluster", 151 | "release" : "release", 152 | "version" : { 153 | "major" : "major", 154 | "minor" : "minor", 155 | "micro" : "micro" 156 | } 157 | }, 158 | "plugin" : { 159 | "accounting_storage" : "accounting_storage", 160 | "name" : "name", 161 | "type" : "type", 162 | "data_parser" : "data_parser" 163 | }, 164 | "client" : { 165 | "source" : "source", 166 | "user" : "user", 167 | "group" : "group" 168 | }, 169 | "command" : [ "command", "command" ] 170 | }, 171 | "warnings" : [ { 172 | "description" : "description", 173 | "source" : "source" 174 | }, { 175 | "description" : "description", 176 | "source" : "source" 177 | } ], 178 | "errors" : [ { 179 | "description" : "description", 180 | "source" : "source", 181 | "error" : "error", 182 | "error_number" : 5 183 | }, { 184 | "description" : "description", 185 | "source" : "source", 186 | "error" : "error", 187 | "error_number" : 5 188 | } ] 189 | } 190 | -------------------------------------------------------------------------------- /testdata/SlurmV0041GetDiag200Response.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta" : { 3 | "slurm" : { 4 | "cluster" : "cluster", 5 | "release" : "release", 6 | "version" : { 7 | "major" : "major", 8 | "minor" : "minor", 9 | "micro" : "micro" 10 | } 11 | }, 12 | "plugin" : { 13 | "accounting_storage" : "accounting_storage", 14 | "name" : "name", 15 | "type" : "type", 16 | "data_parser" : "data_parser" 17 | }, 18 | "client" : { 19 | "source" : "source", 20 | "user" : "user", 21 | "group" : "group" 22 | }, 23 | "command" : [ "command", "command" ] 24 | }, 25 | "warnings" : [ { 26 | "description" : "description", 27 | "source" : "source" 28 | }, { 29 | "description" : "description", 30 | "source" : "source" 31 | } ], 32 | "errors" : [ { 33 | "description" : "description", 34 | "source" : "source", 35 | "error" : "error", 36 | "error_number" : 5 37 | }, { 38 | "description" : "description", 39 | "source" : "source", 40 | "error" : "error", 41 | "error_number" : 5 42 | } ], 43 | "statistics" : { 44 | "bf_cycle_max" : 4, 45 | "rpcs_by_message_type" : [ { 46 | "cycle_last" : 6, 47 | "average_time" : { 48 | "number" : 3, 49 | "set" : true, 50 | "infinite" : true 51 | }, 52 | "type_id" : 0, 53 | "queued" : 5, 54 | "count" : 7, 55 | "dropped" : 4, 56 | "message_type" : "message_type", 57 | "total_time" : 4, 58 | "cycle_max" : 8 59 | }, { 60 | "cycle_last" : 6, 61 | "average_time" : { 62 | "number" : 3, 63 | "set" : true, 64 | "infinite" : true 65 | }, 66 | "type_id" : 0, 67 | "queued" : 5, 68 | "count" : 7, 69 | "dropped" : 4, 70 | "message_type" : "message_type", 71 | "total_time" : 4, 72 | "cycle_max" : 8 73 | } ], 74 | "bf_backfilled_het_jobs" : 3, 75 | "bf_table_size" : 7, 76 | "schedule_cycle_depth" : 7, 77 | "bf_depth_sum" : 0, 78 | "job_states_ts" : { 79 | "number" : 6, 80 | "set" : true, 81 | "infinite" : true 82 | }, 83 | "bf_queue_len" : 4, 84 | "jobs_started" : 6, 85 | "schedule_cycle_max" : 2, 86 | "server_thread_count" : 5, 87 | "bf_queue_len_sum" : 4, 88 | "bf_cycle_last" : 0, 89 | 
"bf_exit" : { 90 | "state_changed" : 5, 91 | "bf_max_time" : 3, 92 | "bf_max_job_start" : 7, 93 | "bf_node_space_size" : 7, 94 | "end_job_queue" : 8, 95 | "bf_max_job_test" : 3 96 | }, 97 | "agent_thread_count" : 7, 98 | "jobs_completed" : 3, 99 | "bf_depth_mean" : 0, 100 | "bf_depth_try_sum" : 6, 101 | "schedule_cycle_mean" : 1, 102 | "bf_table_size_sum" : 9, 103 | "agent_queue_size" : 5, 104 | "jobs_failed" : 1, 105 | "bf_last_depth_try" : 4, 106 | "req_time" : { 107 | "number" : 6, 108 | "set" : true, 109 | "infinite" : true 110 | }, 111 | "bf_cycle_counter" : 3, 112 | "schedule_queue_length" : 8, 113 | "bf_queue_len_mean" : 1, 114 | "schedule_exit" : { 115 | "max_sched_time" : 9, 116 | "licenses" : 6, 117 | "default_queue_depth" : 4, 118 | "max_job_start" : 5, 119 | "max_rpc_cnt" : 9, 120 | "end_job_queue" : 1 121 | }, 122 | "jobs_canceled" : 6, 123 | "schedule_cycle_sum" : 7, 124 | "jobs_submitted" : 9, 125 | "schedule_cycle_mean_depth" : 1, 126 | "schedule_cycle_per_minute" : 6, 127 | "req_time_start" : { 128 | "number" : 1, 129 | "set" : true, 130 | "infinite" : true 131 | }, 132 | "jobs_running" : 6, 133 | "bf_last_backfilled_jobs" : 6, 134 | "bf_last_depth" : 3, 135 | "bf_backfilled_jobs" : 5, 136 | "rpcs_by_user" : [ { 137 | "average_time" : { 138 | "number" : 3, 139 | "set" : true, 140 | "infinite" : true 141 | }, 142 | "user_id" : 0, 143 | "count" : 2, 144 | "total_time" : 1, 145 | "user" : "user" 146 | }, { 147 | "average_time" : { 148 | "number" : 3, 149 | "set" : true, 150 | "infinite" : true 151 | }, 152 | "user_id" : 0, 153 | "count" : 2, 154 | "total_time" : 1, 155 | "user" : "user" 156 | } ], 157 | "bf_cycle_mean" : 7, 158 | "pending_rpcs_by_hostlist" : [ { 159 | "type_id" : 4, 160 | "count" : [ "count", "count" ], 161 | "message_type" : "message_type" 162 | }, { 163 | "type_id" : 4, 164 | "count" : [ "count", "count" ], 165 | "message_type" : "message_type" 166 | } ], 167 | "dbd_agent_queue_size" : 9, 168 | "bf_table_size_mean" : 0, 169 | "jobs_pending" : 2, 170 | "agent_count" : 2, 171 | "bf_cycle_sum" : 6, 172 | "parts_packed" : 0, 173 | "bf_active" : true, 174 | "bf_depth_mean_try" : 7, 175 | "gettimeofday_latency" : 3, 176 | "pending_rpcs" : [ { 177 | "type_id" : 8, 178 | "count" : 6, 179 | "message_type" : "message_type" 180 | }, { 181 | "type_id" : 8, 182 | "count" : 6, 183 | "message_type" : "message_type" 184 | } ], 185 | "schedule_cycle_total" : 1, 186 | "bf_when_last_cycle" : { 187 | "number" : 9, 188 | "set" : true, 189 | "infinite" : true 190 | }, 191 | "schedule_cycle_last" : 4 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /internal/slurm/queue.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type QueueCollector struct { 14 | ctx context.Context 15 | pending *prometheus.Desc 16 | pending_dep *prometheus.Desc 17 | running *prometheus.Desc 18 | suspended *prometheus.Desc 19 | cancelled *prometheus.Desc 20 | completing *prometheus.Desc 21 | completed *prometheus.Desc 22 | configuring *prometheus.Desc 23 | failed *prometheus.Desc 24 | timeout *prometheus.Desc 25 | preempted *prometheus.Desc 26 | node_fail *prometheus.Desc 27 | } 28 | 29 | func NewQueueCollector(ctx 
context.Context) *QueueCollector { 30 | return &QueueCollector{ 31 | ctx: ctx, 32 | pending: prometheus.NewDesc("slurm_queue_pending", "Pending jobs in queue", nil, nil), 33 | pending_dep: prometheus.NewDesc("slurm_queue_pending_dependency", "Pending jobs because of dependency in queue", nil, nil), 34 | running: prometheus.NewDesc("slurm_queue_running", "Running jobs in the cluster", nil, nil), 35 | suspended: prometheus.NewDesc("slurm_queue_suspended", "Suspended jobs in the cluster", nil, nil), 36 | cancelled: prometheus.NewDesc("slurm_queue_cancelled", "Cancelled jobs in the cluster", nil, nil), 37 | completing: prometheus.NewDesc("slurm_queue_completing", "Completing jobs in the cluster", nil, nil), 38 | completed: prometheus.NewDesc("slurm_queue_completed", "Completed jobs in the cluster", nil, nil), 39 | configuring: prometheus.NewDesc("slurm_queue_configuring", "Configuring jobs in the cluster", nil, nil), 40 | failed: prometheus.NewDesc("slurm_queue_failed", "Number of failed jobs", nil, nil), 41 | timeout: prometheus.NewDesc("slurm_queue_timeout", "Jobs stopped by timeout", nil, nil), 42 | preempted: prometheus.NewDesc("slurm_queue_preempted", "Number of preempted jobs", nil, nil), 43 | node_fail: prometheus.NewDesc("slurm_queue_node_fail", "Number of jobs stopped due to node fail", nil, nil), 44 | } 45 | } 46 | 47 | func (qc *QueueCollector) Describe(ch chan<- *prometheus.Desc) { 48 | ch <- qc.pending 49 | ch <- qc.pending_dep 50 | ch <- qc.running 51 | ch <- qc.suspended 52 | ch <- qc.cancelled 53 | ch <- qc.completing 54 | ch <- qc.completed 55 | ch <- qc.configuring 56 | ch <- qc.failed 57 | ch <- qc.timeout 58 | ch <- qc.preempted 59 | ch <- qc.node_fail 60 | } 61 | 62 | func (qc *QueueCollector) Collect(ch chan<- prometheus.Metric) { 63 | apiCache := qc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 64 | jobsRespBytes, found := apiCache.Get("jobs") 65 | if !found { 66 | slog.Error("failed to get jobs response for users metrics from cache") 67 | return 68 | } 69 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 70 | if err != nil { 71 | slog.Error("failed to process jobs data for queue metrics", "error", err) 72 | return 73 | } 74 | qm, err := ParseQueueMetrics(jobsData) 75 | if err != nil { 76 | slog.Error("failed to collect queue metrics", "error", err) 77 | return 78 | } 79 | ch <- prometheus.MustNewConstMetric(qc.pending, prometheus.GaugeValue, qm.pending) 80 | ch <- prometheus.MustNewConstMetric(qc.pending_dep, prometheus.GaugeValue, qm.pending_dep) 81 | ch <- prometheus.MustNewConstMetric(qc.running, prometheus.GaugeValue, qm.running) 82 | ch <- prometheus.MustNewConstMetric(qc.suspended, prometheus.GaugeValue, qm.suspended) 83 | ch <- prometheus.MustNewConstMetric(qc.cancelled, prometheus.GaugeValue, qm.cancelled) 84 | ch <- prometheus.MustNewConstMetric(qc.completing, prometheus.GaugeValue, qm.completing) 85 | ch <- prometheus.MustNewConstMetric(qc.completed, prometheus.GaugeValue, qm.completed) 86 | ch <- prometheus.MustNewConstMetric(qc.configuring, prometheus.GaugeValue, qm.configuring) 87 | ch <- prometheus.MustNewConstMetric(qc.failed, prometheus.GaugeValue, qm.failed) 88 | ch <- prometheus.MustNewConstMetric(qc.timeout, prometheus.GaugeValue, qm.timeout) 89 | ch <- prometheus.MustNewConstMetric(qc.preempted, prometheus.GaugeValue, qm.preempted) 90 | ch <- prometheus.MustNewConstMetric(qc.node_fail, prometheus.GaugeValue, qm.node_fail) 91 | } 92 | 93 | func NewQueueMetrics() *queueMetrics { 94 | return &queueMetrics{} 95 | } 96 | 97 | type 
queueMetrics struct { 98 | pending float64 99 | pending_dep float64 100 | running float64 101 | suspended float64 102 | cancelled float64 103 | completing float64 104 | completed float64 105 | configuring float64 106 | failed float64 107 | timeout float64 108 | preempted float64 109 | node_fail float64 110 | } 111 | 112 | func ParseQueueMetrics(jobsData *api.JobsData) (*queueMetrics, error) { 113 | qm := NewQueueMetrics() 114 | for _, j := range jobsData.Jobs { 115 | switch j.JobState { 116 | case types.JobStatePending: 117 | if j.Dependency != "" { 118 | qm.pending_dep++ 119 | } else { 120 | qm.pending++ 121 | } 122 | case types.JobStateRunning: 123 | qm.running++ 124 | case types.JobStateSuspended: 125 | qm.suspended++ 126 | case types.JobStateCancelled: 127 | qm.cancelled++ 128 | case types.JobStateCompleting: 129 | qm.completing++ 130 | case types.JobStateCompleted: 131 | qm.completed++ 132 | case types.JobStateConfiguring: 133 | qm.configuring++ 134 | case types.JobStateFailed: 135 | qm.failed++ 136 | case types.JobStateTimeout: 137 | qm.timeout++ 138 | case types.JobStatePreempted: 139 | qm.preempted++ 140 | case types.JobStateNodeFail: 141 | qm.node_fail++ 142 | } 143 | } 144 | return qm, nil 145 | } 146 | -------------------------------------------------------------------------------- /internal/api/transport.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "io" 8 | "log/slog" 9 | "net" 10 | "net/http" 11 | "strings" 12 | 13 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 14 | ) 15 | 16 | type slurmRestRequest struct { 17 | req *http.Request 18 | client *http.Client 19 | } 20 | 21 | type SlurmRestResponse struct { 22 | StatusCode int 23 | Body []byte 24 | } 25 | 26 | // GetSlurmRestResponse retrieves response data from slurm api 27 | func GetSlurmRestResponse(ctx context.Context, endpointCtxKey types.Key) ([]byte, error) { 28 | var endpointStr string 29 | switch endpointCtxKey { 30 | case types.ApiDiagEndpointKey: 31 | endpointStr = "diag" 32 | case types.ApiJobsEndpointKey: 33 | endpointStr = "jobs" 34 | case types.ApiNodesEndpointKey: 35 | endpointStr = "nodes" 36 | case types.ApiPartitionsEndpointKey: 37 | endpointStr = "partitions" 38 | case types.ApiSharesEndpointKey: 39 | endpointStr = "shares" 40 | default: 41 | return nil, fmt.Errorf("invalid endpoint key") 42 | } 43 | slog.Debug("performing rest request", "endpoint", endpointStr) 44 | nr, err := newSlurmRestRequest(ctx, endpointCtxKey) 45 | if err != nil { 46 | return nil, fmt.Errorf("failed to generate new slurm rest request: %v", err) 47 | } 48 | resp, err := nr.Send() 49 | if err != nil { 50 | return nil, fmt.Errorf("failed to retrieve slurm rest response: %v", err) 51 | } 52 | // sometimes slurm fails to get stuff. we want to error here 53 | if resp.StatusCode == 500 { 54 | slog.Debug("incorrect response status code", "endpoint", endpointStr, "code", resp.StatusCode, "body", string(resp.Body)) 55 | 56 | // try to unmarshal the api error and give a better log 57 | var aed APIErrorData 58 | var errStr string 59 | err := json.Unmarshal(resp.Body, &aed) 60 | if err != nil { 61 | errStr = "tried to get more data about the error but failed. 
try debug mode for more information" 62 | } 63 | if err == nil { errStr = aed.ToString() } 64 | return nil, fmt.Errorf("internal server error (500) from slurm controller getting %s data: %s", endpointStr, errStr) 65 | } 66 | // unauthorized responses should say that 67 | if resp.StatusCode == 401 { 68 | return nil, fmt.Errorf("unauthorized: invalid credentials") 69 | } 70 | // otherwise, it should be status 200, so this catches unsupported status codes 71 | if resp.StatusCode != 200 { 72 | slog.Debug("incorrect response status code", "endpoint", endpointStr, "code", resp.StatusCode, "body", string(resp.Body)) 73 | return nil, fmt.Errorf("received incorrect status code for %s data", endpointStr) 74 | } 75 | slog.Debug("successfully queried slurm rest data", "endpoint", endpointStr) 76 | return resp.Body, nil 77 | } 78 | 79 | // newSlurmRestRequest returns a new slurmRestRequest object which is used to perform 80 | // http interactions with the slurmrest server. It configures everything up until 81 | // the request is actually sent to get data. 82 | func newSlurmRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 83 | apiURL := ctx.Value(types.ApiURLKey).(string) 84 | 85 | if strings.HasPrefix(apiURL, "unix://") { 86 | return newSlurmUnixRestRequest(ctx, k) 87 | } else if strings.HasPrefix(apiURL, "http://") || strings.HasPrefix(apiURL, "https://") { 88 | return newSlurmInetRestRequest(ctx, k) 89 | } 90 | return nil, fmt.Errorf("invalid SLURM_EXPORTER_API_URL: %s", apiURL) 91 | } 92 | 93 | func newSlurmInetRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 94 | apiUser := ctx.Value(types.ApiUserKey).(string) 95 | apiToken := ctx.Value(types.ApiTokenKey).(string) 96 | apiURL := ctx.Value(types.ApiURLKey).(string) 97 | apiEndpoint := ctx.Value(k).(string) 98 | 99 | url := fmt.Sprintf("%s/%s", apiURL, apiEndpoint) 100 | req, err := http.NewRequest("GET", url, nil) 101 | if err != nil { 102 | return nil, err 103 | } 104 | req.Header.Set("Accept", "application/json") 105 | req.Header.Set("X-SLURM-USER-NAME", apiUser) 106 | req.Header.Set("X-SLURM-USER-TOKEN", apiToken) 107 | 108 | return &slurmRestRequest{ 109 | req: req, 110 | client: &http.Client{}, 111 | }, nil 112 | } 113 | 114 | func newSlurmUnixRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 115 | apiURL := ctx.Value(types.ApiURLKey).(string) 116 | apiEndpoint := ctx.Value(k).(string) 117 | 118 | socketPath := strings.TrimPrefix(apiURL, "unix:") 119 | url := fmt.Sprintf("http://unix/%s", apiEndpoint) 120 | req, err := http.NewRequest("GET", url, nil) 121 | if err != nil { 122 | return nil, err 123 | } 124 | req.Header.Set("Accept", "application/json") 125 | 126 | return &slurmRestRequest{ 127 | req: req, 128 | client: &http.Client{ 129 | Transport: &http.Transport{ 130 | DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { 131 | return net.Dial("unix", socketPath) 132 | }, 133 | DisableKeepAlives: true, 134 | }, 135 | }, 136 | }, nil 137 | } 138 | 139 | // slurmRestRequest.Send is used to perform the request against the slurmrest 140 | // server. It returns a *SlurmRestResponse which is a struct containing the 141 | // response status code and the bytes of the response body. 
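//
// A minimal usage sketch (an illustrative aside, not upstream behavior beyond what
// GetSlurmRestResponse already does above; it assumes ctx was populated at startup
// with the API URL, user, token, and endpoint values keyed by internal/types):
//
//	sr, err := newSlurmRestRequest(ctx, types.ApiDiagEndpointKey)
//	if err != nil {
//		return nil, err
//	}
//	resp, err := sr.Send()
//	if err != nil {
//		return nil, err
//	}
//	// resp.StatusCode and resp.Body hold the raw slurmrestd reply.
//	slog.Debug("slurmrestd reply", "code", resp.StatusCode, "bytes", len(resp.Body))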
142 | func (sr slurmRestRequest) Send() (*SlurmRestResponse, error) { 143 | resp, err := sr.client.Do(sr.req) 144 | if err != nil { 145 | return nil, fmt.Errorf("failed to send request: %v", err) 146 | } 147 | defer resp.Body.Close() 148 | 149 | body, err := io.ReadAll(resp.Body) 150 | if err != nil { 151 | return nil, fmt.Errorf("failed to read response body: %v", err) 152 | } 153 | 154 | sresp := SlurmRestResponse{} 155 | sresp.StatusCode = resp.StatusCode 156 | sresp.Body = body 157 | 158 | return &sresp, nil 159 | } 160 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiNodesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes" : [ { 3 | "reason" : "reason", 4 | "gpu_spec" : "gpu_spec", 5 | "slurmd_start_time" : { 6 | "number" : 3, 7 | "set" : true, 8 | "infinite" : true 9 | }, 10 | "features" : [ "features", "features" ], 11 | "hostname" : "hostname", 12 | "cores" : 1, 13 | "reason_changed_at" : { 14 | "number" : 9, 15 | "set" : true, 16 | "infinite" : true 17 | }, 18 | "reservation" : "reservation", 19 | "tres" : "tres", 20 | "cpu_binding" : 5, 21 | "state" : [ "INVALID", "INVALID" ], 22 | "sockets" : 6, 23 | "energy" : { 24 | "current_watts" : { 25 | "number" : 1, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "base_consumed_energy" : 4, 30 | "last_collected" : 1, 31 | "consumed_energy" : 7, 32 | "previous_consumed_energy" : 1, 33 | "average_watts" : 2 34 | }, 35 | "partitions" : [ "partitions", "partitions" ], 36 | "gres_drained" : "gres_drained", 37 | "weight" : 6, 38 | "version" : "version", 39 | "gres_used" : "gres_used", 40 | "mcs_label" : "mcs_label", 41 | "real_memory" : 4, 42 | "instance_id" : "instance_id", 43 | "burstbuffer_network_address" : "burstbuffer_network_address", 44 | "port" : 1, 45 | "name" : "name", 46 | "resume_after" : { 47 | "number" : 9, 48 | "set" : true, 49 | "infinite" : true 50 | }, 51 | "temporary_disk" : 2, 52 | "tres_used" : "tres_used", 53 | "effective_cpus" : 3, 54 | "instance_type" : "instance_type", 55 | "external_sensors" : {}, 56 | "res_cores_per_gpu" : 5, 57 | "boards" : 0, 58 | "alloc_cpus" : 8, 59 | "active_features" : [ "active_features", "active_features" ], 60 | "reason_set_by_user" : "reason_set_by_user", 61 | "free_mem" : { 62 | "number" : 7, 63 | "set" : true, 64 | "infinite" : true 65 | }, 66 | "alloc_idle_cpus" : 9, 67 | "extra" : "extra", 68 | "operating_system" : "operating_system", 69 | "power" : {}, 70 | "architecture" : "architecture", 71 | "owner" : "owner", 72 | "cluster_name" : "cluster_name", 73 | "address" : "address", 74 | "cpus" : 9, 75 | "tres_weighted" : 6.438423552598547, 76 | "gres" : "gres", 77 | "threads" : 1, 78 | "boot_time" : { 79 | "number" : 6, 80 | "set" : true, 81 | "infinite" : true 82 | }, 83 | "alloc_memory" : 6, 84 | "specialized_memory" : 7, 85 | "specialized_cpus" : "specialized_cpus", 86 | "specialized_cores" : 5, 87 | "last_busy" : { 88 | "number" : 6, 89 | "set" : true, 90 | "infinite" : true 91 | }, 92 | "comment" : "comment", 93 | "next_state_after_reboot" : [ "INVALID", "INVALID" ], 94 | "cpu_load" : 2 95 | }, { 96 | "reason" : "reason", 97 | "gpu_spec" : "gpu_spec", 98 | "slurmd_start_time" : { 99 | "number" : 3, 100 | "set" : true, 101 | "infinite" : true 102 | }, 103 | "features" : [ "features", "features" ], 104 | "hostname" : "hostname", 105 | "cores" : 1, 106 | "reason_changed_at" : { 107 | "number" : 9, 108 | "set" : true, 109 | "infinite" : true 110 | }, 111 | "reservation" 
: "reservation", 112 | "tres" : "tres", 113 | "cpu_binding" : 5, 114 | "state" : [ "INVALID", "INVALID" ], 115 | "sockets" : 6, 116 | "energy" : { 117 | "current_watts" : { 118 | "number" : 1, 119 | "set" : true, 120 | "infinite" : true 121 | }, 122 | "base_consumed_energy" : 4, 123 | "last_collected" : 1, 124 | "consumed_energy" : 7, 125 | "previous_consumed_energy" : 1, 126 | "average_watts" : 2 127 | }, 128 | "partitions" : [ "partitions", "partitions" ], 129 | "gres_drained" : "gres_drained", 130 | "weight" : 6, 131 | "version" : "version", 132 | "gres_used" : "gres_used", 133 | "mcs_label" : "mcs_label", 134 | "real_memory" : 4, 135 | "instance_id" : "instance_id", 136 | "burstbuffer_network_address" : "burstbuffer_network_address", 137 | "port" : 1, 138 | "name" : "name", 139 | "resume_after" : { 140 | "number" : 9, 141 | "set" : true, 142 | "infinite" : true 143 | }, 144 | "temporary_disk" : 2, 145 | "tres_used" : "tres_used", 146 | "effective_cpus" : 3, 147 | "instance_type" : "instance_type", 148 | "external_sensors" : {}, 149 | "res_cores_per_gpu" : 5, 150 | "boards" : 0, 151 | "alloc_cpus" : 8, 152 | "active_features" : [ "active_features", "active_features" ], 153 | "reason_set_by_user" : "reason_set_by_user", 154 | "free_mem" : { 155 | "number" : 7, 156 | "set" : true, 157 | "infinite" : true 158 | }, 159 | "alloc_idle_cpus" : 9, 160 | "extra" : "extra", 161 | "operating_system" : "operating_system", 162 | "power" : {}, 163 | "architecture" : "architecture", 164 | "owner" : "owner", 165 | "cluster_name" : "cluster_name", 166 | "address" : "address", 167 | "cpus" : 9, 168 | "tres_weighted" : 6.438423552598547, 169 | "gres" : "gres", 170 | "threads" : 1, 171 | "boot_time" : { 172 | "number" : 6, 173 | "set" : true, 174 | "infinite" : true 175 | }, 176 | "alloc_memory" : 6, 177 | "specialized_memory" : 7, 178 | "specialized_cpus" : "specialized_cpus", 179 | "specialized_cores" : 5, 180 | "last_busy" : { 181 | "number" : 6, 182 | "set" : true, 183 | "infinite" : true 184 | }, 185 | "comment" : "comment", 186 | "next_state_after_reboot" : [ "INVALID", "INVALID" ], 187 | "cpu_load" : 2 188 | } ], 189 | "meta" : { 190 | "slurm" : { 191 | "cluster" : "cluster", 192 | "release" : "release", 193 | "version" : { 194 | "major" : "major", 195 | "minor" : "minor", 196 | "micro" : "micro" 197 | } 198 | }, 199 | "plugin" : { 200 | "accounting_storage" : "accounting_storage", 201 | "name" : "name", 202 | "type" : "type", 203 | "data_parser" : "data_parser" 204 | }, 205 | "client" : { 206 | "source" : "source", 207 | "user" : "user", 208 | "group" : "group" 209 | }, 210 | "command" : [ "command", "command" ] 211 | }, 212 | "last_update" : { 213 | "number" : 6, 214 | "set" : true, 215 | "infinite" : true 216 | }, 217 | "warnings" : [ { 218 | "description" : "description", 219 | "source" : "source" 220 | }, { 221 | "description" : "description", 222 | "source" : "source" 223 | } ], 224 | "errors" : [ { 225 | "description" : "description", 226 | "source" : "source", 227 | "error" : "error", 228 | "error_number" : 5 229 | }, { 230 | "description" : "description", 231 | "source" : "source", 232 | "error" : "error", 233 | "error_number" : 5 234 | } ] 235 | } 236 | -------------------------------------------------------------------------------- /internal/slurm/partitions.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "strings" 7 | 8 | "github.com/akyoto/cache" 9 | 
"github.com/lcrownover/prometheus-slurm-exporter/internal/api" 10 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 11 | "github.com/prometheus/client_golang/prometheus" 12 | ) 13 | 14 | type PartitionsCollector struct { 15 | ctx context.Context 16 | allocated *prometheus.Desc 17 | idle *prometheus.Desc 18 | other *prometheus.Desc 19 | pending *prometheus.Desc 20 | total *prometheus.Desc 21 | } 22 | 23 | func NewPartitionsCollector(ctx context.Context) *PartitionsCollector { 24 | labels := []string{"partition"} 25 | return &PartitionsCollector{ 26 | ctx: ctx, 27 | allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels, nil), 28 | idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels, nil), 29 | other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels, nil), 30 | pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels, nil), 31 | total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels, nil), 32 | } 33 | } 34 | 35 | func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) { 36 | ch <- pc.allocated 37 | ch <- pc.idle 38 | ch <- pc.other 39 | ch <- pc.pending 40 | ch <- pc.total 41 | } 42 | 43 | func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) { 44 | apiCache := pc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 45 | partitionsRespBytes, found := apiCache.Get("partitions") 46 | if !found { 47 | slog.Error("failed to get partitions response for partitions metrics from cache") 48 | return 49 | } 50 | jobsRespBytes, found := apiCache.Get("jobs") 51 | if !found { 52 | slog.Error("failed to get jobs response for users metrics from cache") 53 | return 54 | } 55 | nodesRespBytes, found := apiCache.Get("nodes") 56 | if !found { 57 | slog.Error("failed to get nodes response for cpu metrics from cache") 58 | return 59 | } 60 | partitionsData, err := api.ProcessPartitionsResponse(partitionsRespBytes.([]byte)) 61 | if err != nil { 62 | slog.Error("failed to process partitions data for partitions metrics", "error", err) 63 | return 64 | } 65 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 66 | if err != nil { 67 | slog.Error("failed to process jobs data for partitions metrics", "error", err) 68 | return 69 | } 70 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 71 | if err != nil { 72 | slog.Error("failed to process nodes data for partitions metrics", "error", err) 73 | return 74 | } 75 | pm, err := ParsePartitionsMetrics(partitionsData, jobsData, nodesData) 76 | if err != nil { 77 | slog.Error("failed to collect partitions metrics", "error", err) 78 | return 79 | } 80 | for p := range pm { 81 | if pm[p].cpus_allocated > 0 { 82 | ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].cpus_allocated, p) 83 | } 84 | if pm[p].cpus_idle > 0 { 85 | ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].cpus_idle, p) 86 | } 87 | if pm[p].cpus_other > 0 { 88 | ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].cpus_other, p) 89 | } 90 | if pm[p].cpus_total > 0 { 91 | ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].cpus_total, p) 92 | } 93 | if pm[p].jobs_pending > 0 { 94 | ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].jobs_pending, p) 95 | } 96 | } 97 | } 98 | 99 | func NewPartitionsMetrics() *partitionMetrics { 100 | 
return &partitionMetrics{0, 0, 0, 0, 0} 101 | } 102 | 103 | type partitionMetrics struct { 104 | cpus_allocated float64 105 | cpus_idle float64 106 | cpus_other float64 107 | cpus_total float64 108 | jobs_pending float64 109 | } 110 | 111 | // ParsePartitionsMetrics returns a map where the keys are the partition names and the values are a partitionMetrics struct 112 | func ParsePartitionsMetrics(partitionsData *api.PartitionsData, jobsData *api.JobsData, nodesData *api.NodesData) (map[string]*partitionMetrics, error) { 113 | partitions := make(map[string]*partitionMetrics) 114 | nodePartitions := make(map[string][]string) 115 | 116 | // first, scan through partition data to easily get total cpus 117 | for _, p := range partitionsData.Partitions { 118 | _, exists := partitions[p.Name] 119 | if !exists { 120 | partitions[p.Name] = NewPartitionsMetrics() 121 | } 122 | 123 | // cpu total 124 | partitions[p.Name].cpus_total = float64(p.Cpus) 125 | } 126 | 127 | // we need to gather cpus from the nodes perspective because a node can 128 | // be a member of multiple partitions, running a job in one partition, and 129 | // we want to see that there are allocated cpus on the other partition because 130 | // of the shared node. 131 | for _, n := range nodesData.Nodes { 132 | nodePartitions[n.Name] = n.Partitions 133 | } 134 | 135 | // to get used and available cpus, we need to scan through the job list and categorize 136 | // each job by its partition, adding the cpus as we go 137 | for _, n := range nodesData.Nodes { 138 | alloc_cpus := n.AllocCpus 139 | idle_cpus := n.AllocIdleCpus 140 | nodePartitionNames := n.Partitions 141 | for _, partitionName := range nodePartitionNames { 142 | // this needs to exist to handle the test data provided by SLURM 143 | // where the nodes response example data does not correspond to 144 | // the partitions response example data. in real data, the 145 | // partition names should already exist in the map 146 | _, exists := partitions[partitionName] 147 | if !exists { 148 | partitions[partitionName] = NewPartitionsMetrics() 149 | } 150 | 151 | partitions[partitionName].cpus_allocated += float64(alloc_cpus) 152 | partitions[partitionName].cpus_idle += float64(idle_cpus) 153 | } 154 | } 155 | 156 | // derive the other stat 157 | for i, p := range partitions { 158 | partitions[i].cpus_other = p.cpus_total - p.cpus_allocated - p.cpus_idle 159 | } 160 | 161 | // lastly, we need to get a count of pending jobs for the partition 162 | for _, j := range jobsData.Jobs { 163 | // partition name can be comma-separated, so we iterate through it 164 | pnames := strings.Split(j.Partition, ",") 165 | for _, partitionName := range pnames { 166 | // this needs to exist to handle the test data provided by SLURM 167 | // where the nodes response example data does not correspond to 168 | // the partitions response example data. 
in real data, the 169 | // partition names should already exist in the map 170 | _, exists := partitions[partitionName] 171 | if !exists { 172 | partitions[partitionName] = NewPartitionsMetrics() 173 | } 174 | partitions[partitionName].jobs_pending += 1 175 | } 176 | } 177 | 178 | return partitions, nil 179 | } 180 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiPartitionResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "partitions" : [ { 3 | "cluster" : "cluster", 4 | "cpus" : { 5 | "task_binding" : 6, 6 | "total" : 1 7 | }, 8 | "timeouts" : { 9 | "resume" : { 10 | "number" : 9, 11 | "set" : true, 12 | "infinite" : true 13 | }, 14 | "suspend" : { 15 | "number" : 6, 16 | "set" : true, 17 | "infinite" : true 18 | } 19 | }, 20 | "groups" : { 21 | "allowed" : "allowed" 22 | }, 23 | "alternate" : "alternate", 24 | "select_type" : [ "CPU", "CPU" ], 25 | "suspend_time" : { 26 | "number" : 8, 27 | "set" : true, 28 | "infinite" : true 29 | }, 30 | "priority" : { 31 | "tier" : 9, 32 | "job_factor" : 5 33 | }, 34 | "node_sets" : "node_sets", 35 | "maximums" : { 36 | "shares" : 1, 37 | "nodes" : { 38 | "number" : 1, 39 | "set" : true, 40 | "infinite" : true 41 | }, 42 | "over_time_limit" : { 43 | "number" : 1, 44 | "set" : true, 45 | "infinite" : true 46 | }, 47 | "cpus_per_node" : { 48 | "number" : 3, 49 | "set" : true, 50 | "infinite" : true 51 | }, 52 | "cpus_per_socket" : { 53 | "number" : 2, 54 | "set" : true, 55 | "infinite" : true 56 | }, 57 | "partition_memory_per_node" : { 58 | "number" : 1, 59 | "set" : true, 60 | "infinite" : true 61 | }, 62 | "oversubscribe" : { 63 | "jobs" : 6, 64 | "flags" : [ "force", "force" ] 65 | }, 66 | "memory_per_cpu" : 4, 67 | "time" : { 68 | "number" : 7, 69 | "set" : true, 70 | "infinite" : true 71 | }, 72 | "partition_memory_per_cpu" : { 73 | "number" : 7, 74 | "set" : true, 75 | "infinite" : true 76 | } 77 | }, 78 | "nodes" : { 79 | "configured" : "configured", 80 | "total" : 0, 81 | "allowed_allocation" : "allowed_allocation" 82 | }, 83 | "partition" : { 84 | "state" : [ "INACTIVE", "INACTIVE" ] 85 | }, 86 | "qos" : { 87 | "deny" : "deny", 88 | "allowed" : "allowed", 89 | "assigned" : "assigned" 90 | }, 91 | "defaults" : { 92 | "partition_memory_per_node" : { 93 | "number" : 2, 94 | "set" : true, 95 | "infinite" : true 96 | }, 97 | "memory_per_cpu" : 5, 98 | "time" : { 99 | "number" : 7, 100 | "set" : true, 101 | "infinite" : true 102 | }, 103 | "job" : "job", 104 | "partition_memory_per_cpu" : { 105 | "number" : 5, 106 | "set" : true, 107 | "infinite" : true 108 | } 109 | }, 110 | "name" : "name", 111 | "tres" : { 112 | "configured" : "configured", 113 | "billing_weights" : "billing_weights" 114 | }, 115 | "accounts" : { 116 | "deny" : "deny", 117 | "allowed" : "allowed" 118 | }, 119 | "minimums" : { 120 | "nodes" : 4 121 | }, 122 | "grace_time" : 9 123 | }, { 124 | "cluster" : "cluster", 125 | "cpus" : { 126 | "task_binding" : 6, 127 | "total" : 1 128 | }, 129 | "timeouts" : { 130 | "resume" : { 131 | "number" : 9, 132 | "set" : true, 133 | "infinite" : true 134 | }, 135 | "suspend" : { 136 | "number" : 6, 137 | "set" : true, 138 | "infinite" : true 139 | } 140 | }, 141 | "groups" : { 142 | "allowed" : "allowed" 143 | }, 144 | "alternate" : "alternate", 145 | "select_type" : [ "CPU", "CPU" ], 146 | "suspend_time" : { 147 | "number" : 8, 148 | "set" : true, 149 | "infinite" : true 150 | }, 151 | "priority" : { 152 | "tier" : 9, 153 | "job_factor" : 
5 154 | }, 155 | "node_sets" : "node_sets", 156 | "maximums" : { 157 | "shares" : 1, 158 | "nodes" : { 159 | "number" : 1, 160 | "set" : true, 161 | "infinite" : true 162 | }, 163 | "over_time_limit" : { 164 | "number" : 1, 165 | "set" : true, 166 | "infinite" : true 167 | }, 168 | "cpus_per_node" : { 169 | "number" : 3, 170 | "set" : true, 171 | "infinite" : true 172 | }, 173 | "cpus_per_socket" : { 174 | "number" : 2, 175 | "set" : true, 176 | "infinite" : true 177 | }, 178 | "partition_memory_per_node" : { 179 | "number" : 1, 180 | "set" : true, 181 | "infinite" : true 182 | }, 183 | "oversubscribe" : { 184 | "jobs" : 6, 185 | "flags" : [ "force", "force" ] 186 | }, 187 | "memory_per_cpu" : 4, 188 | "time" : { 189 | "number" : 7, 190 | "set" : true, 191 | "infinite" : true 192 | }, 193 | "partition_memory_per_cpu" : { 194 | "number" : 7, 195 | "set" : true, 196 | "infinite" : true 197 | } 198 | }, 199 | "nodes" : { 200 | "configured" : "configured", 201 | "total" : 0, 202 | "allowed_allocation" : "allowed_allocation" 203 | }, 204 | "partition" : { 205 | "state" : [ "INACTIVE", "INACTIVE" ] 206 | }, 207 | "qos" : { 208 | "deny" : "deny", 209 | "allowed" : "allowed", 210 | "assigned" : "assigned" 211 | }, 212 | "defaults" : { 213 | "partition_memory_per_node" : { 214 | "number" : 2, 215 | "set" : true, 216 | "infinite" : true 217 | }, 218 | "memory_per_cpu" : 5, 219 | "time" : { 220 | "number" : 7, 221 | "set" : true, 222 | "infinite" : true 223 | }, 224 | "job" : "job", 225 | "partition_memory_per_cpu" : { 226 | "number" : 5, 227 | "set" : true, 228 | "infinite" : true 229 | } 230 | }, 231 | "name" : "name", 232 | "tres" : { 233 | "configured" : "configured", 234 | "billing_weights" : "billing_weights" 235 | }, 236 | "accounts" : { 237 | "deny" : "deny", 238 | "allowed" : "allowed" 239 | }, 240 | "minimums" : { 241 | "nodes" : 4 242 | }, 243 | "grace_time" : 9 244 | } ], 245 | "meta" : { 246 | "slurm" : { 247 | "cluster" : "cluster", 248 | "release" : "release", 249 | "version" : { 250 | "major" : "major", 251 | "minor" : "minor", 252 | "micro" : "micro" 253 | } 254 | }, 255 | "plugin" : { 256 | "accounting_storage" : "accounting_storage", 257 | "name" : "name", 258 | "type" : "type", 259 | "data_parser" : "data_parser" 260 | }, 261 | "client" : { 262 | "source" : "source", 263 | "user" : "user", 264 | "group" : "group" 265 | }, 266 | "command" : [ "command", "command" ] 267 | }, 268 | "last_update" : { 269 | "number" : 9, 270 | "set" : true, 271 | "infinite" : true 272 | }, 273 | "warnings" : [ { 274 | "description" : "description", 275 | "source" : "source" 276 | }, { 277 | "description" : "description", 278 | "source" : "source" 279 | } ], 280 | "errors" : [ { 281 | "description" : "description", 282 | "source" : "source", 283 | "error" : "error", 284 | "error_number" : 5 285 | }, { 286 | "description" : "description", 287 | "source" : "source", 288 | "error" : "error", 289 | "error_number" : 5 290 | } ] 291 | } 292 | -------------------------------------------------------------------------------- /internal/slurm/scheduler.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type SchedulerCollector struct { 14 | ctx context.Context 15 | threads 
*prometheus.Desc 16 | queue_size *prometheus.Desc 17 | dbd_queue_size *prometheus.Desc 18 | last_cycle *prometheus.Desc 19 | mean_cycle *prometheus.Desc 20 | cycle_per_minute *prometheus.Desc 21 | backfill_last_cycle *prometheus.Desc 22 | backfill_mean_cycle *prometheus.Desc 23 | backfill_depth_mean *prometheus.Desc 24 | total_backfilled_jobs_since_start *prometheus.Desc 25 | total_backfilled_jobs_since_cycle *prometheus.Desc 26 | total_backfilled_heterogeneous *prometheus.Desc 27 | } 28 | 29 | func NewSchedulerCollector(ctx context.Context) *SchedulerCollector { 30 | return &SchedulerCollector{ 31 | ctx: ctx, 32 | threads: prometheus.NewDesc( 33 | "slurm_scheduler_threads", 34 | "Information provided by the Slurm sdiag command, number of scheduler threads", 35 | nil, 36 | nil), 37 | queue_size: prometheus.NewDesc( 38 | "slurm_scheduler_queue_size", 39 | "Information provided by the Slurm sdiag command, length of the scheduler queue", 40 | nil, 41 | nil), 42 | dbd_queue_size: prometheus.NewDesc( 43 | "slurm_scheduler_dbd_queue_size", 44 | "Information provided by the Slurm sdiag command, length of the DBD agent queue", 45 | nil, 46 | nil), 47 | last_cycle: prometheus.NewDesc( 48 | "slurm_scheduler_last_cycle", 49 | "Information provided by the Slurm sdiag command, scheduler last cycle time (in microseconds)", 50 | nil, 51 | nil), 52 | mean_cycle: prometheus.NewDesc( 53 | "slurm_scheduler_mean_cycle", 54 | "Information provided by the Slurm sdiag command, scheduler mean cycle time (in microseconds)", 55 | nil, 56 | nil), 57 | cycle_per_minute: prometheus.NewDesc( 58 | "slurm_scheduler_cycle_per_minute", 59 | "Information provided by the Slurm sdiag command, number of scheduler cycles per minute", 60 | nil, 61 | nil), 62 | backfill_last_cycle: prometheus.NewDesc( 63 | "slurm_scheduler_backfill_last_cycle", 64 | "Information provided by the Slurm sdiag command, scheduler backfill last cycle time (in microseconds)", 65 | nil, 66 | nil), 67 | backfill_mean_cycle: prometheus.NewDesc( 68 | "slurm_scheduler_backfill_mean_cycle", 69 | "Information provided by the Slurm sdiag command, scheduler backfill mean cycle time (in microseconds)", 70 | nil, 71 | nil), 72 | backfill_depth_mean: prometheus.NewDesc( 73 | "slurm_scheduler_backfill_depth_mean", 74 | "Information provided by the Slurm sdiag command, scheduler backfill mean depth", 75 | nil, 76 | nil), 77 | total_backfilled_jobs_since_start: prometheus.NewDesc( 78 | "slurm_scheduler_backfilled_jobs_since_start_total", 79 | "Information provided by the Slurm sdiag command, number of jobs started thanks to backfilling since last slurm start", 80 | nil, 81 | nil), 82 | total_backfilled_jobs_since_cycle: prometheus.NewDesc( 83 | "slurm_scheduler_backfilled_jobs_since_cycle_total", 84 | "Information provided by the Slurm sdiag command, number of jobs started thanks to backfilling since last time stats were reset", 85 | nil, 86 | nil), 87 | total_backfilled_heterogeneous: prometheus.NewDesc( 88 | "slurm_scheduler_backfilled_heterogeneous_total", 89 | "Information provided by the Slurm sdiag command, number of heterogeneous job components started thanks to backfilling since last Slurm start", 90 | nil, 91 | nil), 92 | } 93 | } 94 | 95 | // Send all metric descriptions 96 | func (c *SchedulerCollector) Describe(ch chan<- *prometheus.Desc) { 97 | ch <- c.threads 98 | ch <- c.queue_size 99 | ch <- c.dbd_queue_size 100 | ch <- c.last_cycle 101 | ch <- c.mean_cycle 102 | ch <- c.cycle_per_minute 103 | ch <- c.backfill_last_cycle 104 | ch <- 
c.backfill_mean_cycle 105 | ch <- c.backfill_depth_mean 106 | ch <- c.total_backfilled_jobs_since_start 107 | ch <- c.total_backfilled_jobs_since_cycle 108 | ch <- c.total_backfilled_heterogeneous 109 | } 110 | 111 | // Send the values of all metrics 112 | func (sc *SchedulerCollector) Collect(ch chan<- prometheus.Metric) { 113 | apiCache := sc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 114 | diagRespBytes, found := apiCache.Get("diag") 115 | if !found { 116 | slog.Error("failed to get diag response for scheduler metrics from cache") 117 | return 118 | } 119 | diagData, err := api.ProcessDiagResponse(diagRespBytes.([]byte)) 120 | if err != nil { 121 | slog.Error("failed to process diag response for scheduler metrics", "error", err) 122 | return 123 | } 124 | sm, err := ParseSchedulerMetrics(diagData) 125 | if err != nil { 126 | slog.Error("failed to collect scheduler metrics", "error", err) 127 | return 128 | } 129 | ch <- prometheus.MustNewConstMetric(sc.threads, prometheus.GaugeValue, sm.threads) 130 | ch <- prometheus.MustNewConstMetric(sc.queue_size, prometheus.GaugeValue, sm.queue_size) 131 | ch <- prometheus.MustNewConstMetric(sc.dbd_queue_size, prometheus.GaugeValue, sm.dbd_queue_size) 132 | ch <- prometheus.MustNewConstMetric(sc.last_cycle, prometheus.GaugeValue, sm.last_cycle) 133 | ch <- prometheus.MustNewConstMetric(sc.mean_cycle, prometheus.GaugeValue, sm.mean_cycle) 134 | ch <- prometheus.MustNewConstMetric(sc.cycle_per_minute, prometheus.GaugeValue, sm.cycle_per_minute) 135 | ch <- prometheus.MustNewConstMetric(sc.backfill_last_cycle, prometheus.GaugeValue, sm.backfill_last_cycle) 136 | ch <- prometheus.MustNewConstMetric(sc.backfill_mean_cycle, prometheus.GaugeValue, sm.backfill_mean_cycle) 137 | ch <- prometheus.MustNewConstMetric(sc.backfill_depth_mean, prometheus.GaugeValue, sm.backfill_depth_mean) 138 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_jobs_since_start, prometheus.GaugeValue, sm.total_backfilled_jobs_since_start) 139 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_jobs_since_cycle, prometheus.GaugeValue, sm.total_backfilled_jobs_since_cycle) 140 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_heterogeneous, prometheus.GaugeValue, sm.total_backfilled_heterogeneous) 141 | } 142 | 143 | func NewSchedulerMetrics() *schedulerMetrics { 144 | return &schedulerMetrics{} 145 | } 146 | 147 | type schedulerMetrics struct { 148 | threads float64 149 | queue_size float64 150 | dbd_queue_size float64 151 | last_cycle float64 152 | mean_cycle float64 153 | cycle_per_minute float64 154 | backfill_last_cycle float64 155 | backfill_mean_cycle float64 156 | backfill_depth_mean float64 157 | total_backfilled_jobs_since_start float64 158 | total_backfilled_jobs_since_cycle float64 159 | total_backfilled_heterogeneous float64 160 | } 161 | 162 | // Extract the relevant metrics from the sdiag output 163 | func ParseSchedulerMetrics(diagData *api.DiagData) (*schedulerMetrics, error) { 164 | sm := NewSchedulerMetrics() 165 | 166 | sm.threads = float64(diagData.ServerThreadCount) 167 | sm.queue_size = float64(diagData.AgentQueueSize) 168 | sm.dbd_queue_size = float64(diagData.DbdAgentQueueSize) 169 | sm.last_cycle = float64(diagData.ScheduleCycleLast) 170 | sm.mean_cycle = float64(diagData.ScheduleCycleMean) 171 | sm.cycle_per_minute = float64(diagData.ScheduleCyclePerMinute) 172 | sm.backfill_depth_mean = float64(diagData.BfDepthMean) 173 | sm.backfill_last_cycle = float64(diagData.BfCycleLast) 174 | sm.backfill_mean_cycle = 
float64(diagData.BfCycleMean) 175 | sm.total_backfilled_jobs_since_cycle = float64(diagData.BfBackfilledJobs) 176 | sm.total_backfilled_heterogeneous = float64(diagData.BfBackfilledHetJobs) 177 | sm.total_backfilled_jobs_since_start = float64(diagData.BfLastBackfilledJobs) 178 | return sm, nil 179 | } 180 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiSharesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : 2.3021358869347655, 18 | "shares" : { 19 | "number" : 1, 20 | "set" : true, 21 | "infinite" : true 22 | }, 23 | "partition" : "partition", 24 | "usage_normalized" : { 25 | "number" : 7.061401241503109, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "name" : "name", 30 | "tres" : { 31 | "run_seconds" : [ { 32 | "name" : "name", 33 | "value" : { 34 | "number" : 5, 35 | "set" : true, 36 | "infinite" : true 37 | } 38 | }, { 39 | "name" : "name", 40 | "value" : { 41 | "number" : 5, 42 | "set" : true, 43 | "infinite" : true 44 | } 45 | } ], 46 | "usage" : [ { 47 | "name" : "name", 48 | "value" : 5.637376656633329 49 | }, { 50 | "name" : "name", 51 | "value" : 5.637376656633329 52 | } ], 53 | "group_minutes" : [ { 54 | "name" : "name", 55 | "value" : { 56 | "number" : 5, 57 | "set" : true, 58 | "infinite" : true 59 | } 60 | }, { 61 | "name" : "name", 62 | "value" : { 63 | "number" : 5, 64 | "set" : true, 65 | "infinite" : true 66 | } 67 | } ] 68 | }, 69 | "id" : 0 70 | }, 71 | { 72 | "id": 104, 73 | "cluster": "cluster1", 74 | "name": "user1", 75 | "parent": "group1", 76 | "partition": "", 77 | "shares_normalized": { 78 | "set": true, 79 | "infinite": false, 80 | "number": 0.333333 81 | }, 82 | "shares": { 83 | "set": true, 84 | "infinite": false, 85 | "number": 1 86 | }, 87 | "tres": { 88 | "run_seconds": [ 89 | { 90 | "name": "cpu", 91 | "value": { 92 | "set": true, 93 | "infinite": false, 94 | "number": 0 95 | } 96 | }, 97 | { 98 | "name": "mem", 99 | "value": { 100 | "set": true, 101 | "infinite": false, 102 | "number": 0 103 | } 104 | }, 105 | { 106 | "name": "energy", 107 | "value": { 108 | "set": true, 109 | "infinite": false, 110 | "number": 0 111 | } 112 | }, 113 | { 114 | "name": "node", 115 | "value": { 116 | "set": true, 117 | "infinite": false, 118 | "number": 0 119 | } 120 | }, 121 | { 122 | "name": "billing", 123 | "value": { 124 | "set": true, 125 | "infinite": false, 126 | "number": 0 127 | } 128 | }, 129 | { 130 | "name": "fs\/disk", 131 | "value": { 132 | "set": true, 133 | "infinite": false, 134 | "number": 0 135 | } 136 | }, 137 | { 138 | "name": "vmem", 139 | "value": { 140 | "set": true, 141 | "infinite": false, 142 | "number": 0 143 | } 144 | }, 145 | { 146 | "name": "pages", 147 | "value": { 148 | "set": true, 149 | "infinite": false, 150 | "number": 0 151 | } 152 | }, 153 | { 154 | "name": "gres\/gpu", 155 | "value": { 156 | "set": true, 157 | "infinite": false, 158 | "number": 0 159 | } 160 | }, 161 | { 162 | "name": "gres\/gpu:a100", 163 | "value": { 164 | "set": true, 165 | "infinite": false, 166 | "number": 0 167 | } 168 | }, 169 | { 170 | "name": "gres\/gpumem", 171 | "value": { 172 | 
"set": true, 173 | "infinite": false, 174 | "number": 0 175 | } 176 | }, 177 | { 178 | "name": "gres\/gpuutil", 179 | "value": { 180 | "set": true, 181 | "infinite": false, 182 | "number": 0 183 | } 184 | } 185 | ], 186 | "group_minutes": [ 187 | { 188 | "name": "cpu", 189 | "value": { 190 | "set": false, 191 | "infinite": true, 192 | "number": 0 193 | } 194 | }, 195 | { 196 | "name": "mem", 197 | "value": { 198 | "set": false, 199 | "infinite": true, 200 | "number": 0 201 | } 202 | }, 203 | { 204 | "name": "energy", 205 | "value": { 206 | "set": false, 207 | "infinite": true, 208 | "number": 0 209 | } 210 | }, 211 | { 212 | "name": "node", 213 | "value": { 214 | "set": false, 215 | "infinite": true, 216 | "number": 0 217 | } 218 | }, 219 | { 220 | "name": "billing", 221 | "value": { 222 | "set": false, 223 | "infinite": true, 224 | "number": 0 225 | } 226 | }, 227 | { 228 | "name": "fs\/disk", 229 | "value": { 230 | "set": false, 231 | "infinite": true, 232 | "number": 0 233 | } 234 | }, 235 | { 236 | "name": "vmem", 237 | "value": { 238 | "set": false, 239 | "infinite": true, 240 | "number": 0 241 | } 242 | }, 243 | { 244 | "name": "pages", 245 | "value": { 246 | "set": false, 247 | "infinite": true, 248 | "number": 0 249 | } 250 | }, 251 | { 252 | "name": "gres\/gpu", 253 | "value": { 254 | "set": false, 255 | "infinite": true, 256 | "number": 0 257 | } 258 | }, 259 | { 260 | "name": "gres\/gpu:a100", 261 | "value": { 262 | "set": false, 263 | "infinite": true, 264 | "number": 0 265 | } 266 | }, 267 | { 268 | "name": "gres\/gpumem", 269 | "value": { 270 | "set": false, 271 | "infinite": true, 272 | "number": 0 273 | } 274 | }, 275 | { 276 | "name": "gres\/gpuutil", 277 | "value": { 278 | "set": false, 279 | "infinite": true, 280 | "number": 0 281 | } 282 | } 283 | ], 284 | "usage": [ 285 | { 286 | "name" : "name", 287 | "value" : 5.637376656633329 288 | }, { 289 | "name" : "name", 290 | "value" : 5.637376656633329 291 | } 292 | ] 293 | 294 | } 295 | }, 296 | { 297 | "cluster" : "cluster", 298 | "parent" : "parent", 299 | "shares_normalized" : { 300 | "number" : 6.027456183070403, 301 | "set" : true, 302 | "infinite" : true 303 | }, 304 | "usage" : 9, 305 | "fairshare" : { 306 | "level" : Infinity, 307 | "factor" : 3.616076749251911 308 | }, 309 | "type" : [ "USER", "USER" ], 310 | "effective_usage" : 2.3021358869347655, 311 | "shares" : { 312 | "number" : 1, 313 | "set" : true, 314 | "infinite" : true 315 | }, 316 | "partition" : "partition", 317 | "usage_normalized" : { 318 | "number" : 7.061401241503109, 319 | "set" : true, 320 | "infinite" : true 321 | }, 322 | "name" : "name", 323 | "tres" : { 324 | "run_seconds" : [ { 325 | "name" : "name", 326 | "value" : { 327 | "number" : 5, 328 | "set" : true, 329 | "infinite" : true 330 | } 331 | }, { 332 | "name" : "name", 333 | "value" : { 334 | "number" : 5, 335 | "set" : true, 336 | "infinite" : true 337 | } 338 | } ], 339 | "usage" : [ 340 | { 341 | "name" : "name", 342 | "value" : 5.637376656633329 343 | }, { 344 | "name" : "name", 345 | "value" : 5.637376656633329 346 | } 347 | ], 348 | "group_minutes" : [ { 349 | "name" : "name", 350 | "value" : { 351 | "number" : 5, 352 | "set" : true, 353 | "infinite" : true 354 | } 355 | }, { 356 | "name" : "name", 357 | "value" : { 358 | "number" : 5, 359 | "set" : true, 360 | "infinite" : true 361 | } 362 | } ] 363 | }, 364 | "id" : 0 365 | } ], 366 | "total_shares" : 4 367 | }, 368 | "meta" : { 369 | "slurm" : { 370 | "cluster" : "cluster", 371 | "release" : "release", 372 | "version" : { 373 | 
"major" : "major", 374 | "minor" : "minor", 375 | "micro" : "micro" 376 | } 377 | }, 378 | "plugin" : { 379 | "accounting_storage" : "accounting_storage", 380 | "name" : "name", 381 | "type" : "type", 382 | "data_parser" : "data_parser" 383 | }, 384 | "client" : { 385 | "source" : "source", 386 | "user" : "user", 387 | "group" : "group" 388 | }, 389 | "command" : [ "command", "command" ] 390 | }, 391 | "warnings" : [ { 392 | "description" : "description", 393 | "source" : "source" 394 | }, { 395 | "description" : "description", 396 | "source" : "source" 397 | } ], 398 | "errors" : [ { 399 | "description" : "description", 400 | "source" : "source", 401 | "error" : "error", 402 | "error_number" : 5 403 | }, { 404 | "description" : "description", 405 | "source" : "source", 406 | "error" : "error", 407 | "error_number" : 5 408 | } ] 409 | } 410 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiDiagResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "statistics": { 3 | "parts_packed": 1, 4 | "req_time": { 5 | "set": true, 6 | "infinite": false, 7 | "number": 1726764981 8 | }, 9 | "req_time_start": { 10 | "set": true, 11 | "infinite": false, 12 | "number": 1726704000 13 | }, 14 | "server_thread_count": 2, 15 | "agent_queue_size": 0, 16 | "agent_count": 0, 17 | "agent_thread_count": 0, 18 | "dbd_agent_queue_size": 0, 19 | "gettimeofday_latency": 17, 20 | "schedule_cycle_max": 14942, 21 | "schedule_cycle_last": 22, 22 | "schedule_cycle_total": 1065, 23 | "schedule_cycle_mean": 49, 24 | "schedule_cycle_mean_depth": 0, 25 | "schedule_cycle_per_minute": 1, 26 | "schedule_queue_length": 0, 27 | "schedule_exit": { 28 | "end_job_queue": 1065, 29 | "default_queue_depth": 0, 30 | "max_job_start": 0, 31 | "max_rpc_cnt": 0, 32 | "max_sched_time": 0, 33 | "licenses": 0 34 | }, 35 | "jobs_submitted": 2, 36 | "jobs_started": 0, 37 | "jobs_completed": 2, 38 | "jobs_canceled": 0, 39 | "jobs_failed": 0, 40 | "jobs_pending": 25, 41 | "jobs_running": 1, 42 | "job_states_ts": { 43 | "set": true, 44 | "infinite": false, 45 | "number": 1726764972 46 | }, 47 | "bf_backfilled_jobs": 13, 48 | "bf_last_backfilled_jobs": 0, 49 | "bf_backfilled_het_jobs": 0, 50 | "bf_cycle_counter": 0, 51 | "bf_cycle_mean": 0, 52 | "bf_depth_mean": 0, 53 | "bf_depth_mean_try": 0, 54 | "bf_cycle_sum": 0, 55 | "bf_cycle_last": 0, 56 | "bf_last_depth": 0, 57 | "bf_last_depth_try": 0, 58 | "bf_depth_sum": 0, 59 | "bf_depth_try_sum": 0, 60 | "bf_queue_len": 0, 61 | "bf_queue_len_mean": 0, 62 | "bf_queue_len_sum": 0, 63 | "bf_table_size": 1, 64 | "bf_table_size_mean": 0, 65 | "bf_when_last_cycle": { 66 | "set": true, 67 | "infinite": false, 68 | "number": 1726695861 69 | }, 70 | "bf_active": false, 71 | "bf_exit": { 72 | "end_job_queue": 0, 73 | "bf_max_job_start": 0, 74 | "bf_max_job_test": 0, 75 | "bf_max_time": 0, 76 | "bf_node_space_size": 0, 77 | "state_changed": 0 78 | }, 79 | "rpcs_by_message_type": [ 80 | { 81 | "message_type": "REQUEST_TRIGGER_PULL", 82 | "type_id": 2030, 83 | "count": 1, 84 | "average_time": 104, 85 | "total_time": 104 86 | }, 87 | { 88 | "message_type": "REQUEST_CONTROL_STATUS", 89 | "type_id": 2053, 90 | "count": 3578, 91 | "average_time": 21, 92 | "total_time": 76126 93 | }, 94 | { 95 | "message_type": "REQUEST_FED_INFO", 96 | "type_id": 2049, 97 | "count": 59, 98 | "average_time": 21, 99 | "total_time": 1287 100 | }, 101 | { 102 | "message_type": "REQUEST_JOB_USER_INFO", 103 | "type_id": 2039, 104 | "count": 14, 
105 | "average_time": 393, 106 | "total_time": 5513 107 | }, 108 | { 109 | "message_type": "REQUEST_PARTITION_INFO", 110 | "type_id": 2009, 111 | "count": 402960, 112 | "average_time": 42, 113 | "total_time": 17203621 114 | }, 115 | { 116 | "message_type": "REQUEST_SUBMIT_BATCH_JOB", 117 | "type_id": 4003, 118 | "count": 6, 119 | "average_time": 2101, 120 | "total_time": 12608 121 | }, 122 | { 123 | "message_type": "REQUEST_NODE_INFO", 124 | "type_id": 2007, 125 | "count": 404869, 126 | "average_time": 370518, 127 | "total_time": 150011441628 128 | }, 129 | { 130 | "message_type": "REQUEST_CONFIG", 131 | "type_id": 2015, 132 | "count": 661, 133 | "average_time": 73, 134 | "total_time": 48817 135 | }, 136 | { 137 | "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", 138 | "type_id": 1002, 139 | "count": 999, 140 | "average_time": 503, 141 | "total_time": 503284 142 | }, 143 | { 144 | "message_type": "REQUEST_COMPLETE_PROLOG", 145 | "type_id": 6018, 146 | "count": 45, 147 | "average_time": 16740, 148 | "total_time": 753313 149 | }, 150 | { 151 | "message_type": "REQUEST_COMPLETE_BATCH_SCRIPT", 152 | "type_id": 5018, 153 | "count": 36, 154 | "average_time": 196, 155 | "total_time": 7069 156 | }, 157 | { 158 | "message_type": "REQUEST_STEP_COMPLETE", 159 | "type_id": 5016, 160 | "count": 44, 161 | "average_time": 168, 162 | "total_time": 7411 163 | }, 164 | { 165 | "message_type": "REQUEST_JOB_INFO_SINGLE", 166 | "type_id": 2021, 167 | "count": 45, 168 | "average_time": 2438, 169 | "total_time": 109735 170 | }, 171 | { 172 | "message_type": "MESSAGE_EPILOG_COMPLETE", 173 | "type_id": 6012, 174 | "count": 39, 175 | "average_time": 30745088, 176 | "total_time": 1199058444 177 | }, 178 | { 179 | "message_type": "REQUEST_HET_JOB_ALLOC_INFO", 180 | "type_id": 4027, 181 | "count": 2, 182 | "average_time": 205, 183 | "total_time": 410 184 | }, 185 | { 186 | "message_type": "REQUEST_JOB_STEP_CREATE", 187 | "type_id": 5001, 188 | "count": 3, 189 | "average_time": 283, 190 | "total_time": 850 191 | }, 192 | { 193 | "message_type": "REQUEST_RESOURCE_ALLOCATION", 194 | "type_id": 4001, 195 | "count": 18, 196 | "average_time": 274360, 197 | "total_time": 4938484 198 | }, 199 | { 200 | "message_type": "REQUEST_JOB_READY", 201 | "type_id": 4019, 202 | "count": 2, 203 | "average_time": 22, 204 | "total_time": 44 205 | }, 206 | { 207 | "message_type": "REQUEST_UPDATE_PARTITION", 208 | "type_id": 3005, 209 | "count": 34, 210 | "average_time": 201, 211 | "total_time": 6843 212 | }, 213 | { 214 | "message_type": "ACCOUNTING_REGISTER_CTLD", 215 | "type_id": 10003, 216 | "count": 1, 217 | "average_time": 86444, 218 | "total_time": 86444 219 | }, 220 | { 221 | "message_type": "REQUEST_PERSIST_INIT", 222 | "type_id": 6500, 223 | "count": 1, 224 | "average_time": 57, 225 | "total_time": 57 226 | }, 227 | { 228 | "message_type": "ACCOUNTING_UPDATE_MSG", 229 | "type_id": 10001, 230 | "count": 1, 231 | "average_time": 22, 232 | "total_time": 22 233 | }, 234 | { 235 | "message_type": "REQUEST_AUTH_TOKEN", 236 | "type_id": 5039, 237 | "count": 1, 238 | "average_time": 262, 239 | "total_time": 262 240 | }, 241 | { 242 | "message_type": "REQUEST_JOB_INFO", 243 | "type_id": 2003, 244 | "count": 15, 245 | "average_time": 597, 246 | "total_time": 8969 247 | }, 248 | { 249 | "message_type": "REQUEST_STATS_INFO", 250 | "type_id": 2035, 251 | "count": 9, 252 | "average_time": 31, 253 | "total_time": 281 254 | }, 255 | { 256 | "message_type": "REQUEST_SHARE_INFO", 257 | "type_id": 2022, 258 | "count": 8, 259 | "average_time": 3486, 
260 | "total_time": 27888 261 | }, 262 | { 263 | "message_type": "REQUEST_CANCEL_JOB_STEP", 264 | "type_id": 5005, 265 | "count": 1, 266 | "average_time": 218, 267 | "total_time": 218 268 | }, 269 | { 270 | "message_type": "REQUEST_COMPLETE_JOB_ALLOCATION", 271 | "type_id": 5017, 272 | "count": 24, 273 | "average_time": 298, 274 | "total_time": 7167 275 | }, 276 | { 277 | "message_type": "REQUEST_JOB_ALLOCATION_INFO", 278 | "type_id": 4014, 279 | "count": 11, 280 | "average_time": 21, 281 | "total_time": 235 282 | }, 283 | { 284 | "message_type": "REQUEST_KILL_JOB", 285 | "type_id": 5032, 286 | "count": 2, 287 | "average_time": 177, 288 | "total_time": 354 289 | } 290 | ], 291 | "rpcs_by_user": [ 292 | { 293 | "user": "root", 294 | "user_id": 0, 295 | "count": 809723, 296 | "average_time": 186766, 297 | "total_time": 151229024182 298 | }, 299 | { 300 | "user": "slurm", 301 | "user_id": 58, 302 | "count": 3582, 303 | "average_time": 45, 304 | "total_time": 162753 305 | }, 306 | { 307 | "user": "vspauldi", 308 | "user_id": 239489, 309 | "count": 7, 310 | "average_time": 512, 311 | "total_time": 3590 312 | } 313 | ] 314 | }, 315 | "meta": { 316 | "plugin": { 317 | "type": "openapi/slurmctld", 318 | "name": "Slurm OpenAPI slurmctld", 319 | "data_parser": "data_parser/v0.0.40", 320 | "accounting_storage": "accounting_storage/slurmdbd" 321 | }, 322 | "client": { 323 | "source": "[10.174.138.225]:59585", 324 | "user": "root", 325 | "group": "root" 326 | }, 327 | "command": [], 328 | "slurm": { 329 | "version": { 330 | "major": "23", 331 | "micro": "1", 332 | "minor": "11" 333 | }, 334 | "release": "23.11.1", 335 | "cluster": "mycluster" 336 | } 337 | }, 338 | "errors": [], 339 | "warnings": [] 340 | } 341 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiPartitionResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "partitions": [ 3 | { 4 | "nodes": { 5 | "allowed_allocation": "", 6 | "configured": "n[0111-0135,0180-0196]", 7 | "total": 42 8 | }, 9 | "accounts": { 10 | "allowed": "", 11 | "deny": "" 12 | }, 13 | "groups": { 14 | "allowed": "" 15 | }, 16 | "qos": { 17 | "allowed": "", 18 | "deny": "", 19 | "assigned": "compute" 20 | }, 21 | "alternate": "", 22 | "tres": { 23 | "billing_weights": "", 24 | "configured": "cpu=5376,mem=21186186M,node=42,billing=5376" 25 | }, 26 | "cluster": "", 27 | "cpus": { 28 | "task_binding": 0, 29 | "total": 5376 30 | }, 31 | "defaults": { 32 | "memory_per_cpu": -9223372036854771712, 33 | "partition_memory_per_cpu": { 34 | "set": true, 35 | "infinite": false, 36 | "number": 4096 37 | }, 38 | "partition_memory_per_node": { 39 | "set": false, 40 | "infinite": false, 41 | "number": 0 42 | }, 43 | "time": { 44 | "set": true, 45 | "infinite": false, 46 | "number": 1440 47 | }, 48 | "job": "" 49 | }, 50 | "grace_time": 0, 51 | "maximums": { 52 | "cpus_per_node": { 53 | "set": false, 54 | "infinite": true, 55 | "number": 0 56 | }, 57 | "cpus_per_socket": { 58 | "set": false, 59 | "infinite": true, 60 | "number": 0 61 | }, 62 | "memory_per_cpu": 0, 63 | "partition_memory_per_cpu": { 64 | "set": false, 65 | "infinite": false, 66 | "number": 0 67 | }, 68 | "partition_memory_per_node": { 69 | "set": true, 70 | "infinite": false, 71 | "number": 0 72 | }, 73 | "nodes": { 74 | "set": false, 75 | "infinite": true, 76 | "number": 0 77 | }, 78 | "shares": 1, 79 | "oversubscribe": { 80 | "jobs": 1, 81 | "flags": [] 82 | }, 83 | "time": { 84 | "set": true, 85 | "infinite": false, 
86 | "number": 1440 87 | }, 88 | "over_time_limit": { 89 | "set": false, 90 | "infinite": false, 91 | "number": 0 92 | } 93 | }, 94 | "minimums": { 95 | "nodes": 1 96 | }, 97 | "name": "compute", 98 | "node_sets": "", 99 | "priority": { 100 | "job_factor": 200, 101 | "tier": 200 102 | }, 103 | "timeouts": { 104 | "resume": { 105 | "set": false, 106 | "infinite": false, 107 | "number": 0 108 | }, 109 | "suspend": { 110 | "set": false, 111 | "infinite": false, 112 | "number": 0 113 | } 114 | }, 115 | "partition": { 116 | "state": [ 117 | "UP" 118 | ] 119 | }, 120 | "suspend_time": { 121 | "set": false, 122 | "infinite": false, 123 | "number": 0 124 | } 125 | }, 126 | { 127 | "nodes": { 128 | "allowed_allocation": "", 129 | "configured": "n[0149-0160,0162-0172]", 130 | "total": 23 131 | }, 132 | "accounts": { 133 | "allowed": "", 134 | "deny": "" 135 | }, 136 | "groups": { 137 | "allowed": "" 138 | }, 139 | "qos": { 140 | "allowed": "", 141 | "deny": "", 142 | "assigned": "gpu" 143 | }, 144 | "alternate": "", 145 | "tres": { 146 | "billing_weights": "", 147 | "configured": "cpu=1104,mem=8505383M,node=23,billing=1104,gres/gpu=155" 148 | }, 149 | "cluster": "", 150 | "cpus": { 151 | "task_binding": 0, 152 | "total": 1104 153 | }, 154 | "defaults": { 155 | "memory_per_cpu": -9223372036854771712, 156 | "partition_memory_per_cpu": { 157 | "set": true, 158 | "infinite": false, 159 | "number": 4096 160 | }, 161 | "partition_memory_per_node": { 162 | "set": false, 163 | "infinite": false, 164 | "number": 0 165 | }, 166 | "time": { 167 | "set": true, 168 | "infinite": false, 169 | "number": 1440 170 | }, 171 | "job": "" 172 | }, 173 | "grace_time": 0, 174 | "maximums": { 175 | "cpus_per_node": { 176 | "set": false, 177 | "infinite": true, 178 | "number": 0 179 | }, 180 | "cpus_per_socket": { 181 | "set": false, 182 | "infinite": true, 183 | "number": 0 184 | }, 185 | "memory_per_cpu": 0, 186 | "partition_memory_per_cpu": { 187 | "set": false, 188 | "infinite": false, 189 | "number": 0 190 | }, 191 | "partition_memory_per_node": { 192 | "set": true, 193 | "infinite": false, 194 | "number": 0 195 | }, 196 | "nodes": { 197 | "set": false, 198 | "infinite": true, 199 | "number": 0 200 | }, 201 | "shares": 1, 202 | "oversubscribe": { 203 | "jobs": 1, 204 | "flags": [] 205 | }, 206 | "time": { 207 | "set": true, 208 | "infinite": false, 209 | "number": 1440 210 | }, 211 | "over_time_limit": { 212 | "set": false, 213 | "infinite": false, 214 | "number": 0 215 | } 216 | }, 217 | "minimums": { 218 | "nodes": 1 219 | }, 220 | "name": "gpu", 221 | "node_sets": "", 222 | "priority": { 223 | "job_factor": 200, 224 | "tier": 200 225 | }, 226 | "timeouts": { 227 | "resume": { 228 | "set": false, 229 | "infinite": false, 230 | "number": 0 231 | }, 232 | "suspend": { 233 | "set": false, 234 | "infinite": false, 235 | "number": 0 236 | } 237 | }, 238 | "partition": { 239 | "state": [ 240 | "UP" 241 | ] 242 | }, 243 | "suspend_time": { 244 | "set": false, 245 | "infinite": false, 246 | "number": 0 247 | } 248 | }, 249 | { 250 | "nodes": { 251 | "allowed_allocation": "", 252 | "configured": "n[0141-0148,0372-0379]", 253 | "total": 16 254 | }, 255 | "accounts": { 256 | "allowed": "", 257 | "deny": "" 258 | }, 259 | "groups": { 260 | "allowed": "" 261 | }, 262 | "qos": { 263 | "allowed": "", 264 | "deny": "", 265 | "assigned": "memory" 266 | }, 267 | "alternate": "", 268 | "tres": { 269 | "billing_weights": "", 270 | "configured": "cpu=896,mem=36741334M,node=16,billing=896" 271 | }, 272 | "cluster": "", 273 | "cpus": { 274 
| "task_binding": 0, 275 | "total": 896 276 | }, 277 | "defaults": { 278 | "memory_per_cpu": -9223372036854771712, 279 | "partition_memory_per_cpu": { 280 | "set": true, 281 | "infinite": false, 282 | "number": 4096 283 | }, 284 | "partition_memory_per_node": { 285 | "set": false, 286 | "infinite": false, 287 | "number": 0 288 | }, 289 | "time": { 290 | "set": true, 291 | "infinite": false, 292 | "number": 1440 293 | }, 294 | "job": "" 295 | }, 296 | "grace_time": 0, 297 | "maximums": { 298 | "cpus_per_node": { 299 | "set": false, 300 | "infinite": true, 301 | "number": 0 302 | }, 303 | "cpus_per_socket": { 304 | "set": false, 305 | "infinite": true, 306 | "number": 0 307 | }, 308 | "memory_per_cpu": 0, 309 | "partition_memory_per_cpu": { 310 | "set": false, 311 | "infinite": false, 312 | "number": 0 313 | }, 314 | "partition_memory_per_node": { 315 | "set": true, 316 | "infinite": false, 317 | "number": 0 318 | }, 319 | "nodes": { 320 | "set": false, 321 | "infinite": true, 322 | "number": 0 323 | }, 324 | "shares": 1, 325 | "oversubscribe": { 326 | "jobs": 1, 327 | "flags": [] 328 | }, 329 | "time": { 330 | "set": true, 331 | "infinite": false, 332 | "number": 1440 333 | }, 334 | "over_time_limit": { 335 | "set": false, 336 | "infinite": false, 337 | "number": 0 338 | } 339 | }, 340 | "minimums": { 341 | "nodes": 1 342 | }, 343 | "name": "memory", 344 | "node_sets": "", 345 | "priority": { 346 | "job_factor": 200, 347 | "tier": 200 348 | }, 349 | "timeouts": { 350 | "resume": { 351 | "set": false, 352 | "infinite": false, 353 | "number": 0 354 | }, 355 | "suspend": { 356 | "set": false, 357 | "infinite": false, 358 | "number": 0 359 | } 360 | }, 361 | "partition": { 362 | "state": [ 363 | "UP" 364 | ] 365 | }, 366 | "suspend_time": { 367 | "set": false, 368 | "infinite": false, 369 | "number": 0 370 | } 371 | }, 372 | { 373 | "nodes": { 374 | "allowed_allocation": "", 375 | "configured": "n[0142,0144,0146,0148,0372,0374,0376,0378]", 376 | "total": 8 377 | }, 378 | "accounts": { 379 | "allowed": "", 380 | "deny": "" 381 | }, 382 | "groups": { 383 | "allowed": "" 384 | }, 385 | "qos": { 386 | "allowed": "", 387 | "deny": "", 388 | "assigned": "memory" 389 | }, 390 | "alternate": "", 391 | "tres": { 392 | "billing_weights": "", 393 | "configured": "cpu=448,mem=18370667M,node=8,billing=448" 394 | }, 395 | "cluster": "", 396 | "cpus": { 397 | "task_binding": 0, 398 | "total": 448 399 | }, 400 | "defaults": { 401 | "memory_per_cpu": -9223372036854771712, 402 | "partition_memory_per_cpu": { 403 | "set": true, 404 | "infinite": false, 405 | "number": 4096 406 | }, 407 | "partition_memory_per_node": { 408 | "set": false, 409 | "infinite": false, 410 | "number": 0 411 | }, 412 | "time": { 413 | "set": true, 414 | "infinite": false, 415 | "number": 20160 416 | }, 417 | "job": "" 418 | }, 419 | "grace_time": 0, 420 | "maximums": { 421 | "cpus_per_node": { 422 | "set": false, 423 | "infinite": true, 424 | "number": 0 425 | }, 426 | "cpus_per_socket": { 427 | "set": false, 428 | "infinite": true, 429 | "number": 0 430 | }, 431 | "memory_per_cpu": 0, 432 | "partition_memory_per_cpu": { 433 | "set": false, 434 | "infinite": false, 435 | "number": 0 436 | }, 437 | "partition_memory_per_node": { 438 | "set": true, 439 | "infinite": false, 440 | "number": 0 441 | }, 442 | "nodes": { 443 | "set": false, 444 | "infinite": true, 445 | "number": 0 446 | }, 447 | "shares": 1, 448 | "oversubscribe": { 449 | "jobs": 1, 450 | "flags": [] 451 | }, 452 | "time": { 453 | "set": true, 454 | "infinite": false, 455 | 
"number": 20160 456 | }, 457 | "over_time_limit": { 458 | "set": false, 459 | "infinite": false, 460 | "number": 0 461 | } 462 | }, 463 | "minimums": { 464 | "nodes": 1 465 | }, 466 | "name": "memorylong", 467 | "node_sets": "", 468 | "priority": { 469 | "job_factor": 200, 470 | "tier": 200 471 | }, 472 | "timeouts": { 473 | "resume": { 474 | "set": false, 475 | "infinite": false, 476 | "number": 0 477 | }, 478 | "suspend": { 479 | "set": false, 480 | "infinite": false, 481 | "number": 0 482 | } 483 | }, 484 | "partition": { 485 | "state": [ 486 | "UP" 487 | ] 488 | }, 489 | "suspend_time": { 490 | "set": false, 491 | "infinite": false, 492 | "number": 0 493 | } 494 | }, 495 | { 496 | "nodes": { 497 | "allowed_allocation": "", 498 | "configured": "n[0013-0044,0049-0136,0141-0189,0191-0196,0199,0201-0242,0244-0269,0301-0308,0310-0399,0998-1000]", 499 | "total": 345 500 | }, 501 | "accounts": { 502 | "allowed": "", 503 | "deny": "" 504 | }, 505 | "groups": { 506 | "allowed": "" 507 | }, 508 | "qos": { 509 | "allowed": "", 510 | "deny": "", 511 | "assigned": "preempt" 512 | }, 513 | "alternate": "", 514 | "tres": { 515 | "billing_weights": "", 516 | "configured": "cpu=17772,mem=145334877M,node=345,billing=17772,gres/gpu=230" 517 | }, 518 | "cluster": "", 519 | "cpus": { 520 | "task_binding": 0, 521 | "total": 17772 522 | }, 523 | "defaults": { 524 | "memory_per_cpu": -9223372036854771712, 525 | "partition_memory_per_cpu": { 526 | "set": true, 527 | "infinite": false, 528 | "number": 4096 529 | }, 530 | "partition_memory_per_node": { 531 | "set": false, 532 | "infinite": false, 533 | "number": 0 534 | }, 535 | "time": { 536 | "set": true, 537 | "infinite": false, 538 | "number": 10080 539 | }, 540 | "job": "" 541 | }, 542 | "grace_time": 0, 543 | "maximums": { 544 | "cpus_per_node": { 545 | "set": false, 546 | "infinite": true, 547 | "number": 0 548 | }, 549 | "cpus_per_socket": { 550 | "set": false, 551 | "infinite": true, 552 | "number": 0 553 | }, 554 | "memory_per_cpu": 0, 555 | "partition_memory_per_cpu": { 556 | "set": false, 557 | "infinite": false, 558 | "number": 0 559 | }, 560 | "partition_memory_per_node": { 561 | "set": true, 562 | "infinite": false, 563 | "number": 0 564 | }, 565 | "nodes": { 566 | "set": true, 567 | "infinite": false, 568 | "number": 48 569 | }, 570 | "shares": 1, 571 | "oversubscribe": { 572 | "jobs": 1, 573 | "flags": [] 574 | }, 575 | "time": { 576 | "set": true, 577 | "infinite": false, 578 | "number": 10080 579 | }, 580 | "over_time_limit": { 581 | "set": false, 582 | "infinite": false, 583 | "number": 0 584 | } 585 | }, 586 | "minimums": { 587 | "nodes": 1 588 | }, 589 | "name": "preempt", 590 | "node_sets": "", 591 | "priority": { 592 | "job_factor": 1, 593 | "tier": 1 594 | }, 595 | "timeouts": { 596 | "resume": { 597 | "set": false, 598 | "infinite": false, 599 | "number": 0 600 | }, 601 | "suspend": { 602 | "set": false, 603 | "infinite": false, 604 | "number": 0 605 | } 606 | }, 607 | "partition": { 608 | "state": [ 609 | "UP" 610 | ] 611 | }, 612 | "suspend_time": { 613 | "set": false, 614 | "infinite": false, 615 | "number": 0 616 | } 617 | } 618 | ], 619 | "last_update": { 620 | "set": true, 621 | "infinite": false, 622 | "number": 1727286013 623 | }, 624 | "meta": { 625 | "plugin": { 626 | "type": "openapi/slurmctld", 627 | "name": "Slurm OpenAPI slurmctld", 628 | "data_parser": "data_parser/v0.0.40", 629 | "accounting_storage": "accounting_storage/slurmdbd" 630 | }, 631 | "client": { 632 | "source": "[10.174.139.128]:55418", 633 | "user": "root", 
634 | "group": "root" 635 | }, 636 | "command": [], 637 | "slurm": { 638 | "version": { 639 | "major": "23", 640 | "micro": "1", 641 | "minor": "11" 642 | }, 643 | "release": "23.11.1", 644 | "cluster": "cluster" 645 | } 646 | }, 647 | "errors": [], 648 | "warnings": [] 649 | } 650 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiJobInfoResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "account": "jamming", 5 | "accrue_time": { 6 | "set": true, 7 | "infinite": false, 8 | "number": 1722268326 9 | }, 10 | "admin_comment": "", 11 | "allocating_node": "login1", 12 | "array_job_id": { 13 | "set": true, 14 | "infinite": false, 15 | "number": 7725337 16 | }, 17 | "array_task_id": { 18 | "set": true, 19 | "infinite": false, 20 | "number": 411 21 | }, 22 | "array_max_tasks": { 23 | "set": true, 24 | "infinite": false, 25 | "number": 0 26 | }, 27 | "array_task_string": "", 28 | "association_id": 70, 29 | "batch_features": "", 30 | "batch_flag": true, 31 | "batch_host": "n0180", 32 | "flags": [ 33 | "ACCRUE_COUNT_CLEARED", 34 | "JOB_WAS_RUNNING", 35 | "USING_DEFAULT_QOS", 36 | "USING_DEFAULT_WCKEY" 37 | ], 38 | "burst_buffer": "", 39 | "burst_buffer_state": "", 40 | "cluster": "talapas", 41 | "cluster_features": "", 42 | "command": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/relaxAndShearScheme.srun", 43 | "comment": "", 44 | "container": "", 45 | "container_id": "", 46 | "contiguous": false, 47 | "core_spec": 0, 48 | "thread_spec": 32766, 49 | "cores_per_socket": { 50 | "set": false, 51 | "infinite": false, 52 | "number": 0 53 | }, 54 | "billable_tres": { 55 | "set": true, 56 | "infinite": false, 57 | "number": 1.0 58 | }, 59 | "cpus_per_task": { 60 | "set": true, 61 | "infinite": false, 62 | "number": 1 63 | }, 64 | "cpu_frequency_minimum": { 65 | "set": false, 66 | "infinite": false, 67 | "number": 0 68 | }, 69 | "cpu_frequency_maximum": { 70 | "set": false, 71 | "infinite": false, 72 | "number": 0 73 | }, 74 | "cpu_frequency_governor": { 75 | "set": false, 76 | "infinite": false, 77 | "number": 0 78 | }, 79 | "cpus_per_tres": "", 80 | "cron": "", 81 | "deadline": { 82 | "set": true, 83 | "infinite": false, 84 | "number": 0 85 | }, 86 | "delay_boot": { 87 | "set": true, 88 | "infinite": false, 89 | "number": 0 90 | }, 91 | "dependency": "", 92 | "derived_exit_code": { 93 | "status": [ 94 | "SUCCESS" 95 | ], 96 | "return_code": { 97 | "set": true, 98 | "infinite": false, 99 | "number": 0 100 | }, 101 | "signal": { 102 | "id": { 103 | "set": false, 104 | "infinite": false, 105 | "number": 0 106 | }, 107 | "name": "" 108 | } 109 | }, 110 | "eligible_time": { 111 | "set": true, 112 | "infinite": false, 113 | "number": 1722268326 114 | }, 115 | "end_time": { 116 | "set": true, 117 | "infinite": false, 118 | "number": 1722360761 119 | }, 120 | "excluded_nodes": "", 121 | "exit_code": { 122 | "status": [ 123 | "SUCCESS" 124 | ], 125 | "return_code": { 126 | "set": true, 127 | "infinite": false, 128 | "number": 0 129 | }, 130 | "signal": { 131 | "id": { 132 | "set": false, 133 | "infinite": false, 134 | "number": 0 135 | }, 136 | "name": "" 137 | } 138 | }, 139 | "extra": "", 140 | "failed_node": "", 141 | "features": "", 142 | "federation_origin": "", 143 | "federation_siblings_active": "", 144 | "federation_siblings_viable": "", 145 | "gres_detail": [], 146 | "group_id": 131, 147 | "group_name": "uoregon", 148 | "het_job_id": { 149 | "set": true, 150 | "infinite": 
false, 151 | "number": 0 152 | }, 153 | "het_job_id_set": "", 154 | "het_job_offset": { 155 | "set": true, 156 | "infinite": false, 157 | "number": 0 158 | }, 159 | "job_id": 7745162, 160 | "job_resources": { 161 | "nodes": "n0180", 162 | "allocated_cores": 1, 163 | "allocated_cpus": 0, 164 | "allocated_hosts": 1, 165 | "allocated_nodes": [ 166 | { 167 | "sockets": { 168 | "1": { 169 | "cores": { 170 | "51": "allocated" 171 | } 172 | } 173 | }, 174 | "nodename": "n0180", 175 | "cpus_used": 0, 176 | "memory_used": 0, 177 | "memory_allocated": 4096 178 | } 179 | ] 180 | }, 181 | "job_size_str": [], 182 | "job_state": [ 183 | "RUNNING" 184 | ], 185 | "last_sched_evaluation": { 186 | "set": true, 187 | "infinite": false, 188 | "number": 1722274361 189 | }, 190 | "licenses": "", 191 | "mail_type": [], 192 | "mail_user": "rdennis", 193 | "max_cpus": { 194 | "set": true, 195 | "infinite": false, 196 | "number": 0 197 | }, 198 | "max_nodes": { 199 | "set": true, 200 | "infinite": false, 201 | "number": 0 202 | }, 203 | "mcs_label": "", 204 | "memory_per_tres": "", 205 | "name": "rands", 206 | "network": "", 207 | "nodes": "n0180", 208 | "nice": 0, 209 | "tasks_per_core": { 210 | "set": false, 211 | "infinite": true, 212 | "number": 0 213 | }, 214 | "tasks_per_tres": { 215 | "set": true, 216 | "infinite": false, 217 | "number": 0 218 | }, 219 | "tasks_per_node": { 220 | "set": true, 221 | "infinite": false, 222 | "number": 1 223 | }, 224 | "tasks_per_socket": { 225 | "set": false, 226 | "infinite": true, 227 | "number": 0 228 | }, 229 | "tasks_per_board": { 230 | "set": true, 231 | "infinite": false, 232 | "number": 0 233 | }, 234 | "cpus": { 235 | "set": true, 236 | "infinite": false, 237 | "number": 1 238 | }, 239 | "node_count": { 240 | "set": true, 241 | "infinite": false, 242 | "number": 1 243 | }, 244 | "tasks": { 245 | "set": true, 246 | "infinite": false, 247 | "number": 1 248 | }, 249 | "partition": "preempt", 250 | "prefer": "", 251 | "memory_per_cpu": { 252 | "set": true, 253 | "infinite": false, 254 | "number": 4096 255 | }, 256 | "memory_per_node": { 257 | "set": false, 258 | "infinite": false, 259 | "number": 0 260 | }, 261 | "minimum_cpus_per_node": { 262 | "set": true, 263 | "infinite": false, 264 | "number": 1 265 | }, 266 | "minimum_tmp_disk_per_node": { 267 | "set": true, 268 | "infinite": false, 269 | "number": 0 270 | }, 271 | "power": { 272 | "flags": [] 273 | }, 274 | "preempt_time": { 275 | "set": true, 276 | "infinite": false, 277 | "number": 0 278 | }, 279 | "preemptable_time": { 280 | "set": true, 281 | "infinite": false, 282 | "number": 1722274361 283 | }, 284 | "pre_sus_time": { 285 | "set": true, 286 | "infinite": false, 287 | "number": 0 288 | }, 289 | "priority": { 290 | "set": true, 291 | "infinite": false, 292 | "number": 169465 293 | }, 294 | "profile": [ 295 | "NOT_SET" 296 | ], 297 | "qos": "normal", 298 | "reboot": false, 299 | "required_nodes": "", 300 | "minimum_switches": 0, 301 | "requeue": false, 302 | "resize_time": { 303 | "set": true, 304 | "infinite": false, 305 | "number": 0 306 | }, 307 | "restart_cnt": 0, 308 | "resv_name": "", 309 | "scheduled_nodes": "", 310 | "selinux_context": "", 311 | "shared": [], 312 | "exclusive": [], 313 | "oversubscribe": true, 314 | "show_flags": [ 315 | "ALL", 316 | "DETAIL", 317 | "LOCAL" 318 | ], 319 | "sockets_per_board": 0, 320 | "sockets_per_node": { 321 | "set": false, 322 | "infinite": false, 323 | "number": 0 324 | }, 325 | "start_time": { 326 | "set": true, 327 | "infinite": false, 328 | "number": 1722274361 
329 | }, 330 | "state_description": "", 331 | "state_reason": "None", 332 | "standard_error": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_411.out", 333 | "standard_input": "/dev/null", 334 | "standard_output": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_411.out", 335 | "submit_time": { 336 | "set": true, 337 | "infinite": false, 338 | "number": 1722268317 339 | }, 340 | "suspend_time": { 341 | "set": true, 342 | "infinite": false, 343 | "number": 0 344 | }, 345 | "system_comment": "", 346 | "time_limit": { 347 | "set": true, 348 | "infinite": false, 349 | "number": 1440 350 | }, 351 | "time_minimum": { 352 | "set": true, 353 | "infinite": false, 354 | "number": 0 355 | }, 356 | "threads_per_core": { 357 | "set": false, 358 | "infinite": false, 359 | "number": 0 360 | }, 361 | "tres_bind": "", 362 | "tres_freq": "", 363 | "tres_per_job": "", 364 | "tres_per_node": "", 365 | "tres_per_socket": "", 366 | "tres_per_task": "", 367 | "tres_req_str": "cpu=1,mem=4G,node=1,billing=1", 368 | "tres_alloc_str": "cpu=1,mem=4G,node=1,billing=1", 369 | "user_id": 110622, 370 | "user_name": "rdennis", 371 | "maximum_switch_wait_time": 0, 372 | "wckey": "", 373 | "current_working_directory": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all" 374 | }, 375 | { 376 | "account": "jamming", 377 | "accrue_time": { 378 | "set": true, 379 | "infinite": false, 380 | "number": 1722268326 381 | }, 382 | "admin_comment": "", 383 | "allocating_node": "login1", 384 | "array_job_id": { 385 | "set": true, 386 | "infinite": false, 387 | "number": 7725337 388 | }, 389 | "array_task_id": { 390 | "set": false, 391 | "infinite": false, 392 | "number": 0 393 | }, 394 | "array_max_tasks": { 395 | "set": true, 396 | "infinite": false, 397 | "number": 0 398 | }, 399 | "array_task_string": "412-9999", 400 | "association_id": 70, 401 | "batch_features": "", 402 | "batch_flag": true, 403 | "batch_host": "", 404 | "flags": [ 405 | "USING_DEFAULT_QOS", 406 | "USING_DEFAULT_WCKEY" 407 | ], 408 | "burst_buffer": "", 409 | "burst_buffer_state": "", 410 | "cluster": "talapas", 411 | "cluster_features": "", 412 | "command": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/relaxAndShearScheme.srun", 413 | "comment": "", 414 | "container": "", 415 | "container_id": "", 416 | "contiguous": false, 417 | "core_spec": 0, 418 | "thread_spec": 32766, 419 | "cores_per_socket": { 420 | "set": false, 421 | "infinite": false, 422 | "number": 0 423 | }, 424 | "billable_tres": { 425 | "set": false, 426 | "infinite": false, 427 | "number": 0.0 428 | }, 429 | "cpus_per_task": { 430 | "set": true, 431 | "infinite": false, 432 | "number": 1 433 | }, 434 | "cpu_frequency_minimum": { 435 | "set": false, 436 | "infinite": false, 437 | "number": 0 438 | }, 439 | "cpu_frequency_maximum": { 440 | "set": false, 441 | "infinite": false, 442 | "number": 0 443 | }, 444 | "cpu_frequency_governor": { 445 | "set": false, 446 | "infinite": false, 447 | "number": 0 448 | }, 449 | "cpus_per_tres": "", 450 | "cron": "", 451 | "deadline": { 452 | "set": true, 453 | "infinite": false, 454 | "number": 0 455 | }, 456 | "delay_boot": { 457 | "set": true, 458 | "infinite": false, 459 | "number": 0 460 | }, 461 | "dependency": "", 462 | "derived_exit_code": { 463 | "status": [ 464 | "SUCCESS" 465 | ], 466 | "return_code": { 467 | "set": true, 468 | "infinite": false, 469 | "number": 0 470 | }, 471 | "signal": { 472 | "id": { 473 | "set": false, 474 | "infinite": false, 475 | "number": 0 476 | }, 477 | "name": 
"" 478 | } 479 | }, 480 | "eligible_time": { 481 | "set": true, 482 | "infinite": false, 483 | "number": 1722268326 484 | }, 485 | "end_time": { 486 | "set": true, 487 | "infinite": false, 488 | "number": 0 489 | }, 490 | "excluded_nodes": "", 491 | "exit_code": { 492 | "status": [ 493 | "SUCCESS" 494 | ], 495 | "return_code": { 496 | "set": true, 497 | "infinite": false, 498 | "number": 0 499 | }, 500 | "signal": { 501 | "id": { 502 | "set": false, 503 | "infinite": false, 504 | "number": 0 505 | }, 506 | "name": "" 507 | } 508 | }, 509 | "extra": "", 510 | "failed_node": "", 511 | "features": "", 512 | "federation_origin": "", 513 | "federation_siblings_active": "", 514 | "federation_siblings_viable": "", 515 | "gres_detail": [], 516 | "group_id": 131, 517 | "group_name": "uoregon", 518 | "het_job_id": { 519 | "set": true, 520 | "infinite": false, 521 | "number": 0 522 | }, 523 | "het_job_id_set": "", 524 | "het_job_offset": { 525 | "set": true, 526 | "infinite": false, 527 | "number": 0 528 | }, 529 | "job_id": 7725337, 530 | "job_resources": {}, 531 | "job_size_str": [], 532 | "job_state": [ 533 | "PENDING" 534 | ], 535 | "last_sched_evaluation": { 536 | "set": true, 537 | "infinite": false, 538 | "number": 1722274361 539 | }, 540 | "licenses": "", 541 | "mail_type": [], 542 | "mail_user": "rdennis", 543 | "max_cpus": { 544 | "set": true, 545 | "infinite": false, 546 | "number": 0 547 | }, 548 | "max_nodes": { 549 | "set": true, 550 | "infinite": false, 551 | "number": 1 552 | }, 553 | "mcs_label": "", 554 | "memory_per_tres": "", 555 | "name": "rands", 556 | "network": "", 557 | "nodes": "", 558 | "nice": 0, 559 | "tasks_per_core": { 560 | "set": false, 561 | "infinite": true, 562 | "number": 0 563 | }, 564 | "tasks_per_tres": { 565 | "set": true, 566 | "infinite": false, 567 | "number": 0 568 | }, 569 | "tasks_per_node": { 570 | "set": true, 571 | "infinite": false, 572 | "number": 1 573 | }, 574 | "tasks_per_socket": { 575 | "set": false, 576 | "infinite": true, 577 | "number": 0 578 | }, 579 | "tasks_per_board": { 580 | "set": true, 581 | "infinite": false, 582 | "number": 0 583 | }, 584 | "cpus": { 585 | "set": true, 586 | "infinite": false, 587 | "number": 1 588 | }, 589 | "node_count": { 590 | "set": true, 591 | "infinite": false, 592 | "number": 1 593 | }, 594 | "tasks": { 595 | "set": true, 596 | "infinite": false, 597 | "number": 1 598 | }, 599 | "partition": "preempt", 600 | "prefer": "", 601 | "memory_per_cpu": { 602 | "set": true, 603 | "infinite": false, 604 | "number": 4096 605 | }, 606 | "memory_per_node": { 607 | "set": false, 608 | "infinite": false, 609 | "number": 0 610 | }, 611 | "minimum_cpus_per_node": { 612 | "set": true, 613 | "infinite": false, 614 | "number": 1 615 | }, 616 | "minimum_tmp_disk_per_node": { 617 | "set": true, 618 | "infinite": false, 619 | "number": 0 620 | }, 621 | "power": { 622 | "flags": [] 623 | }, 624 | "preempt_time": { 625 | "set": true, 626 | "infinite": false, 627 | "number": 0 628 | }, 629 | "preemptable_time": { 630 | "set": true, 631 | "infinite": false, 632 | "number": 0 633 | }, 634 | "pre_sus_time": { 635 | "set": true, 636 | "infinite": false, 637 | "number": 0 638 | }, 639 | "priority": { 640 | "set": true, 641 | "infinite": false, 642 | "number": 169465 643 | }, 644 | "profile": [ 645 | "NOT_SET" 646 | ], 647 | "qos": "normal", 648 | "reboot": false, 649 | "required_nodes": "", 650 | "minimum_switches": 0, 651 | "requeue": false, 652 | "resize_time": { 653 | "set": true, 654 | "infinite": false, 655 | "number": 0 656 | }, 
657 | "restart_cnt": 0, 658 | "resv_name": "", 659 | "scheduled_nodes": "", 660 | "selinux_context": "", 661 | "shared": [], 662 | "exclusive": [], 663 | "oversubscribe": true, 664 | "show_flags": [ 665 | "ALL", 666 | "DETAIL", 667 | "LOCAL" 668 | ], 669 | "sockets_per_board": 0, 670 | "sockets_per_node": { 671 | "set": false, 672 | "infinite": false, 673 | "number": 0 674 | }, 675 | "start_time": { 676 | "set": true, 677 | "infinite": false, 678 | "number": 0 679 | }, 680 | "state_description": "", 681 | "state_reason": "Resources", 682 | "standard_error": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_4294967294.out", 683 | "standard_input": "/dev/null", 684 | "standard_output": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_4294967294.out", 685 | "submit_time": { 686 | "set": true, 687 | "infinite": false, 688 | "number": 1722268317 689 | }, 690 | "suspend_time": { 691 | "set": true, 692 | "infinite": false, 693 | "number": 0 694 | }, 695 | "system_comment": "", 696 | "time_limit": { 697 | "set": true, 698 | "infinite": false, 699 | "number": 1440 700 | }, 701 | "time_minimum": { 702 | "set": true, 703 | "infinite": false, 704 | "number": 0 705 | }, 706 | "threads_per_core": { 707 | "set": false, 708 | "infinite": false, 709 | "number": 0 710 | }, 711 | "tres_bind": "", 712 | "tres_freq": "", 713 | "tres_per_job": "", 714 | "tres_per_node": "", 715 | "tres_per_socket": "", 716 | "tres_per_task": "", 717 | "tres_req_str": "cpu=1,mem=4G,node=1,billing=1", 718 | "tres_alloc_str": "", 719 | "user_id": 110622, 720 | "user_name": "rdennis", 721 | "maximum_switch_wait_time": 0, 722 | "wckey": "", 723 | "current_working_directory": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all" 724 | } 725 | ], 726 | "last_backfill": { 727 | "set": true, 728 | "infinite": false, 729 | "number": 1722274362 730 | }, 731 | "last_update": { 732 | "set": true, 733 | "infinite": false, 734 | "number": 1722274381 735 | }, 736 | "meta": { 737 | "plugin": { 738 | "type": "openapi/slurmctld", 739 | "name": "Slurm OpenAPI slurmctld", 740 | "data_parser": "data_parser/v0.0.40", 741 | "accounting_storage": "accounting_storage/slurmdbd" 742 | }, 743 | "client": { 744 | "source": "[10.174.139.109]:49957", 745 | "user": "root", 746 | "group": "root" 747 | }, 748 | "command": [], 749 | "slurm": { 750 | "version": { 751 | "major": "23", 752 | "micro": "1", 753 | "minor": "11" 754 | }, 755 | "release": "23.11.1", 756 | "cluster": "talapas" 757 | } 758 | }, 759 | "errors": [], 760 | "warnings": [] 761 | } 762 | --------------------------------------------------------------------------------