├── internal ├── api │ ├── transport_test.go │ ├── endpoints.go │ ├── endpoints_2311.go │ ├── endpoints_2405.go │ ├── endpoints_2411.go │ ├── handler.go │ ├── cache.go │ ├── unmarshalers_2311_test.go │ ├── unmarshalers_2405_test.go │ ├── unmarshalers_2411_test.go │ ├── responses_2311.go │ ├── responses_2405.go │ ├── responses_2411.go │ ├── unmarshalers.go │ └── transport.go ├── util │ ├── time.go │ ├── fmt.go │ ├── infinity.go │ └── testdata.go ├── types │ ├── keys.go │ ├── nodes.go │ └── jobs.go └── slurm │ ├── fairshare.go │ ├── users.go │ ├── gpus.go │ ├── cpus.go │ ├── account.go │ ├── node.go │ ├── nodes.go │ ├── queue.go │ ├── partitions.go │ └── scheduler.go ├── .gitignore ├── images ├── Job_Status.png ├── Node_Status.png └── Scheduler_Info.png ├── docker ├── commands ├── lets_go_job.sbatch ├── hello_world_job.sbatch ├── slurmdbd.conf ├── cgroup.conf ├── start_jobs.sh ├── slurm.conf ├── slurm.dockerfile ├── start_slurm.sh ├── 23.11.dockerfile ├── build_slurm_version.py └── 24.05.dockerfile ├── openapitools.json ├── extras └── systemd │ └── prometheus-slurm-exporter.service ├── go.mod ├── Makefile ├── .github └── workflows │ └── gotest.yml ├── .goreleaser.yaml ├── go.sum ├── CONTRIBUTING.md ├── README.md ├── cmd └── prometheus-slurm-exporter │ └── main.go └── testdata ├── SlurmV0041GetShares200Response.json ├── V0041OpenapiSharesResp.json ├── SlurmV0041GetDiag200Response.json ├── V0041OpenapiNodesResp.json ├── V0041OpenapiPartitionResp.json ├── V0040OpenapiSharesResp.json ├── V0040OpenapiDiagResp.json ├── V0040OpenapiPartitionResp.json └── V0040OpenapiJobInfoResp.json /internal/api/transport_test.go: -------------------------------------------------------------------------------- 1 | package api 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | go/ 3 | *.snap 4 | 5 | dist/ 6 | .ansible 7 | -------------------------------------------------------------------------------- /images/Job_Status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Job_Status.png -------------------------------------------------------------------------------- /images/Node_Status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Node_Status.png -------------------------------------------------------------------------------- /images/Scheduler_Info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrownover/prometheus-slurm-exporter/HEAD/images/Scheduler_Info.png -------------------------------------------------------------------------------- /internal/util/time.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "time" 4 | 5 | func NowEpoch() int64 { 6 | return time.Now().Unix() 7 | } 8 | -------------------------------------------------------------------------------- /docker/commands: -------------------------------------------------------------------------------- 1 | docker build -t 24.05 -f 24.05.dockerfile . 
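# the 23.11 image can be built the same way from its sibling dockerfile: docker build -t 23.11 -f 23.11.dockerfile .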
2 | 3 | docker run -it --rm --name slurm_container --entrypoint /bin/bash 24.05 4 | -------------------------------------------------------------------------------- /internal/util/fmt.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "strings" 4 | 5 | func RemoveWhitespace(s string) string { 6 | return strings.Join(strings.Fields(s), "") 7 | } 8 | -------------------------------------------------------------------------------- /openapitools.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "./node_modules/@openapitools/openapi-generator-cli/config.schema.json", 3 | "spaces": 2, 4 | "generator-cli": { 5 | "version": "7.8.0" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docker/lets_go_job.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=lets_go 3 | #SBATCH --output=/jobs/output/lets_go.out 4 | #SBATCH --error=/jobs/err/lets_go.err 5 | #SBATCH --ntasks=1 6 | printf "Lets go\n" 7 | -------------------------------------------------------------------------------- /docker/hello_world_job.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=hello_world 3 | #SBATCH --output=/jobs/output/hello_world.out 4 | #SBATCH --error=/jobs/err/hello_world.err 5 | #SBATCH --time=00:05:00 6 | #SBATCH --ntasks=1 7 | printf "Hello world\n" 8 | -------------------------------------------------------------------------------- /internal/types/keys.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type Key int 4 | 5 | const ( 6 | ApiCacheKey Key = iota 7 | ApiCacheTimeoutKey 8 | ApiUserKey 9 | ApiTokenKey 10 | ApiURLKey 11 | ApiJobsEndpointKey 12 | ApiNodesEndpointKey 13 | ApiPartitionsEndpointKey 14 | ApiDiagEndpointKey 15 | ApiSharesEndpointKey 16 | ) 17 | -------------------------------------------------------------------------------- /extras/systemd/prometheus-slurm-exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus SLURM Exporter 3 | 4 | [Service] 5 | ExecStart=/usr/local/sbin/prometheus-slurm-exporter 6 | EnvironmentFile=/etc/prometheus-slurm-exporter/env.conf 7 | Restart=always 8 | RestartSec=15 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | -------------------------------------------------------------------------------- /docker/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | AuthInfo=/var/run/munge/munge.socket.2 2 | AuthType=auth/munge 3 | DbdHost=localhost 4 | DebugLevel=info 5 | DbdPort=6819 6 | LogFile=/var/log/slurm/slurmdbd.log 7 | SlurmUser=slurm 8 | StorageHost=localhost # or the database server host 9 | StoragePass=root 10 | StorageType=accounting_storage/mysql 11 | StorageUser=slurm 12 | -------------------------------------------------------------------------------- /docker/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupMountpoint=/sys/fs/cgroup 2 | ConstrainCores=yes 3 | EnableControllers=yes 4 | ConstrainRAMSpace=yes 5 | CgroupPlugin=cgroup/v2 6 | ConstrainSwapSpace=yes 7 | ConstrainDevices=yes 8 | #CgroupAutomount=yes Defunct option 9 | AllowedRamSpace=100 10 | AllowedSwapSpace=0 11 | MaxRAMPercent=99 12 | 
MaxSwapPercent=0 13 | MinRAMSpace=200 14 | 15 | IgnoreSystemd=yes 16 | -------------------------------------------------------------------------------- /docker/start_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Submitting jobs" 4 | sbatch /jobs/hello_world_job.sbatch 5 | if [ $? -eq 0 ]; then 6 | echo "hello_world_job.sbatch submitted successfully" 7 | else 8 | echo "Failed to submit hello_world_job.sbatch" 9 | fi 10 | 11 | sbatch /jobs/lets_go_job.sbatch 12 | if [ $? -eq 0 ]; then 13 | echo "lets_go_job.sbatch submitted successfully" 14 | else 15 | echo "Failed to submit lets_go_job.sbatch" 16 | fi 17 | -------------------------------------------------------------------------------- /internal/api/endpoints.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | type endpoint struct { 10 | key types.Key 11 | name string 12 | path string 13 | } 14 | 15 | // this gives a compile warning but centralizes the endpoints 16 | var endpoints = versionedEndpoints 17 | 18 | func RegisterEndpoints(ctx context.Context) context.Context { 19 | for _, e := range endpoints { 20 | ctx = context.WithValue(ctx, e.key, e.path) 21 | } 22 | return ctx 23 | } 24 | -------------------------------------------------------------------------------- /internal/api/endpoints_2311.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.40/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.40/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.40/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.40/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", "/slurm/v0.0.40/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /internal/api/endpoints_2405.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.41/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.41/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.41/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.41/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", "/slurm/v0.0.41/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /internal/api/endpoints_2411.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | import ( 6 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 7 | ) 8 | 9 | var versionedEndpoints = []endpoint{ 10 | {types.ApiJobsEndpointKey, "jobs", "/slurm/v0.0.42/jobs"}, 11 | {types.ApiNodesEndpointKey, "nodes", "/slurm/v0.0.42/nodes"}, 12 | {types.ApiPartitionsEndpointKey, "partitions", "/slurm/v0.0.42/partitions"}, 13 | {types.ApiDiagEndpointKey, "diag", "/slurm/v0.0.42/diag"}, 14 | {types.ApiSharesEndpointKey, "shares", 
"/slurm/v0.0.42/shares"}, 15 | } 16 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/lcrownover/prometheus-slurm-exporter 2 | 3 | go 1.22.5 4 | 5 | require ( 6 | github.com/akyoto/cache v1.0.6 7 | github.com/prometheus/client_golang v1.19.1 8 | ) 9 | 10 | require ( 11 | github.com/beorn7/perks v1.0.1 // indirect 12 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 13 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 14 | github.com/prometheus/client_model v0.6.1 // indirect 15 | github.com/prometheus/common v0.55.0 // indirect 16 | github.com/prometheus/procfs v0.15.1 // indirect 17 | golang.org/x/sys v0.22.0 // indirect 18 | google.golang.org/protobuf v1.34.2 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /internal/api/handler.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "net/http" 7 | 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | ) 11 | 12 | func beforeCollect(ctx context.Context) { 13 | err := PopulateCache(ctx) 14 | if err != nil { 15 | slog.Error("error populating request cache", "error", err) 16 | } 17 | } 18 | 19 | func afterCollect(ctx context.Context) { 20 | WipeCache(ctx) 21 | } 22 | 23 | func MetricsHandler(r *prometheus.Registry, ctx context.Context) http.HandlerFunc { 24 | h := promhttp.HandlerFor(r, promhttp.HandlerOpts{}) 25 | 26 | return func(w http.ResponseWriter, r *http.Request) { 27 | beforeCollect(ctx) 28 | h.ServeHTTP(w, r) 29 | afterCollect(ctx) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /internal/types/nodes.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type NodeState string 4 | 5 | const ( 6 | NodeStateAlloc NodeState = "alloc" 7 | NodeStateComp NodeState = "comp" 8 | NodeStateDown NodeState = "down" 9 | NodeStateDrain NodeState = "drain" 10 | NodeStateFail NodeState = "fail" 11 | NodeStateErr NodeState = "err" 12 | NodeStateIdle NodeState = "idle" 13 | NodeStateMaint NodeState = "maint" 14 | NodeStateMix NodeState = "mix" 15 | NodeStateResv NodeState = "resv" 16 | NodeStatePlanned NodeState = "planned" 17 | NodeStateNotResponding NodeState = "not_responding" 18 | NodeStateInvalid NodeState = "invalid" 19 | NodeStateInvalidReg NodeState = "invalid_reg" 20 | NodeStateDynamicNorm NodeState = "dynamic_norm" 21 | ) 22 | -------------------------------------------------------------------------------- /internal/types/jobs.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | type JobState string 4 | 5 | const ( 6 | JobStatePending JobState = "pending" 7 | JobStateCompleted JobState = "pompleted" 8 | JobStateFailed JobState = "failed" 9 | JobStateOutOfMemory JobState = "out_of_memory" 10 | JobStateRunning JobState = "running" 11 | JobStateSuspended JobState = "suspended" 12 | JobStateUnknown JobState = "unknown" 13 | JobStateTimeout JobState = "timeout" 14 | JobStateCancelled JobState = "cancelled" 15 | JobStateCompleting JobState = "completing" 16 | JobStateConfiguring JobState = "configuring" 17 | JobStatePreempted JobState = "preempted" 18 | JobStateNodeFail JobState = "node_fail" 19 | ) 
20 | 21 | type SlurmJobsResponse struct { 22 | Jobs []slurmJob `json:"jobs"` 23 | } 24 | 25 | type slurmJobCPUs struct { 26 | Number int `json:"number"` 27 | } 28 | 29 | type slurmJob struct { 30 | Account string `json:"account"` 31 | JobStates []string `json:"job_state"` 32 | CPUs slurmJobCPUs `json:"cpus"` 33 | } 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = prometheus-slurm-exporter 2 | 3 | ifndef SLURM_VERSION 4 | $(error SLURM_VERSION environment variable is not set) 5 | endif 6 | 7 | slurm_version := ${SLURM_VERSION} 8 | 9 | # If SLURM_VERSION is "all", print an error message for the default build target 10 | build: 11 | ifeq ($(slurm_version),all) 12 | $(error You must set a specific SLURM_VERSION to build) 13 | else 14 | mkdir -p bin/ 15 | go build -tags=$(subst .,,$(slurm_version)) -o bin/prometheus-slurm-exporter cmd/prometheus-slurm-exporter/main.go 16 | endif 17 | 18 | test: 19 | ifeq ($(slurm_version),all) 20 | # Generate and test for version 24.05 21 | go test -tags=2405 -v ./... 22 | # Generate and test for version 23.11 23 | go test -tags=2311 -v ./... 24 | else 25 | go test -tags=$(subst .,,$(slurm_version)) -v ./... 26 | endif 27 | 28 | install: 29 | cp bin/prometheus-slurm-exporter /usr/local/sbin/prometheus-slurm-exporter 30 | cp extras/systemd/prometheus-slurm-exporter.service /etc/systemd/system/prometheus-slurm-exporter.service 31 | systemctl daemon-reload 32 | -------------------------------------------------------------------------------- /internal/util/infinity.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import "strings" 4 | 5 | func CleanseInfinity(b []byte) []byte { 6 | // this is disgusting but the response has values of "Infinity" which are 7 | // not json unmarshal-able, so I manually replace all the "Infinity"s with the correct 8 | // float64 value that represents Infinity. 9 | // this will be fixed in v0.0.42 10 | // https://support.schedmd.com/show_bug.cgi?id=20817 11 | // 12 | // https://github.com/lcrownover/prometheus-slurm-exporter/issues/8 13 | // also reported that folks are getting "inf" back, so I'll protect for that too 14 | bs := string(b) 15 | maxFloatStr := ": 1.7976931348623157e+308" 16 | // replacing the longer strings first should prevent any partial replacements 17 | bs = strings.ReplaceAll(bs, ": Infinity", maxFloatStr) 18 | bs = strings.ReplaceAll(bs, ": infinity", maxFloatStr) 19 | // sometimes it'd return "inf", so let's cover for that too. 20 | bs = strings.ReplaceAll(bs, ": Inf", maxFloatStr) 21 | bs = strings.ReplaceAll(bs, ": inf", maxFloatStr) 22 | return []byte(bs) 23 | } 24 | -------------------------------------------------------------------------------- /.github/workflows/gotest.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Go Test 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | - develop 9 | pull_request: 10 | branches: 11 | - main 12 | - develop 13 | 14 | jobs: 15 | 16 | build_2311: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@v5 23 | with: 24 | go-version: '1.22.5' 25 | 26 | - name: Build 27 | run: go build -tags=2311 -v ./... 28 | 29 | - name: Test 30 | run: go test -tags=2311 -v ./... 
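  # build_2405 and build_2411 below repeat the same checkout/setup/build/test steps with their matching Go build tag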
31 | 32 | build_2405: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Go 38 | uses: actions/setup-go@v5 39 | with: 40 | go-version: '1.22.5' 41 | 42 | - name: Build 43 | run: go build -tags=2405 -v ./... 44 | 45 | - name: Test 46 | run: go test -tags=2405 -v ./... 47 | 48 | build_2411: 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/checkout@v4 52 | 53 | - name: Set up Go 54 | uses: actions/setup-go@v5 55 | with: 56 | go-version: '1.22.5' 57 | 58 | - name: Build 59 | run: go build -tags=2411 -v ./... 60 | 61 | - name: Test 62 | run: go test -tags=2411 -v ./... 63 | -------------------------------------------------------------------------------- /internal/util/testdata.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "log" 7 | "os" 8 | "path" 9 | "runtime" 10 | ) 11 | 12 | // getTestDataDir returns the path to the `testdata` directory in the project 13 | func getTestDataDir() string { 14 | _, filename, _, _ := runtime.Caller(0) 15 | dir := path.Join(path.Dir(filename), "../..") 16 | err := os.Chdir(dir) 17 | if err != nil { 18 | panic(err) 19 | } 20 | return fmt.Sprintf(dir + "/testdata/") 21 | } 22 | 23 | // GetTestDataFilePath returns the full filepath to the specified filename 24 | // of test data. 25 | // 26 | // Example: GetTestDataFilePath("SomeTestData.json") -> 27 | // 28 | // /home/me/prometheus-slurm-exporter/testdata/SomeTestData.json 29 | func GetTestDataFilePath(filename string) string { 30 | testDataDir := getTestDataDir() 31 | return fmt.Sprintf("%s/%s", testDataDir, filename) 32 | } 33 | 34 | // ReadTestDataBytes takes the short filename of the desired test data file 35 | // and returns that data as bytes. 36 | func ReadTestDataBytes(filename string) []byte { 37 | filepath := GetTestDataFilePath(filename) 38 | file, err := os.Open(filepath) 39 | if err != nil { 40 | log.Fatalf("failed to open file: %v\n", err) 41 | } 42 | defer file.Close() 43 | 44 | data, err := io.ReadAll(file) 45 | if err != nil { 46 | log.Fatalf("failed to read file: %v\n", err) 47 | } 48 | 49 | return data 50 | } 51 | -------------------------------------------------------------------------------- /docker/slurm.conf: -------------------------------------------------------------------------------- 1 | # Basic SLURM configuration 2 | ClusterName=slurm_head 3 | SlurmdPort=6280 4 | SlurmUser=slurm 5 | SlurmctldPort=6281 6 | ProctrackType=proctrack/pgid 7 | # Slurm controller host 8 | #AccountingStorageType=accounting_storage/slurmdbd <-- this breaks squeue and sinfo. Does not allow the daemons to properly come up 9 | AuthType=auth/munge 10 | #AuthAltTypes=auth/jwt 11 | SlurmctldHost=localhost 12 | SlurmctldParameters=enable_configless 13 | #ControlMachine=localhost <- not needed and is ignored by slurm.conf and slurmd 14 | StateSaveLocation=/var/spool/slurm 15 | #CgroupPlugin=cgroup/v2 16 | #DebugFlags=all <- this "all" option does not work. Tried "ALL" as well. 
Removing all together now 17 | SlurmdDebug=debug3 18 | SlurmdLogFile=/var/log/slurm/slurmd.log 19 | SlurmctldLogFile=/var/log/slurm/slurmctld.log 20 | 21 | #TaskPlugin=task/cgroup 12/23/24 22 | #SelectType=select/linear 12/23/24 23 | 24 | # Specify node as both a controller and compute node 25 | #NodeName=localhost CPUs=2 RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN 26 | NodeName=localhost CPUs=1 RealMemory=2048 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 27 | #NodeName=localhost CPUs=12 RealMemory=2048 Sockets=1 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN 28 | 29 | # partition with both controller and compute node 30 | PartitionName=debug Nodes=localhost Default=YES MaxTime=INFINITE State=UP 31 | -------------------------------------------------------------------------------- /internal/api/cache.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/akyoto/cache" 11 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 12 | ) 13 | 14 | // PopulateCache is used to populate the cache with data from the slurm api 15 | func PopulateCache(ctx context.Context) error { 16 | slog.Debug("populating cache") 17 | var data []byte 18 | var err error 19 | 20 | apiCache := ctx.Value(types.ApiCacheKey).(*cache.Cache) 21 | 22 | var wg sync.WaitGroup 23 | wg.Add(len(endpoints)) 24 | errors := make(chan error, len(endpoints)) 25 | 26 | for _, e := range endpoints { 27 | go func(e endpoint) { 28 | defer wg.Done() 29 | data, err = GetSlurmRestResponse(ctx, e.key) 30 | if err != nil { 31 | errors <- fmt.Errorf("failed to get slurmrestd %s response: %v", e.path, err) 32 | } 33 | apiCache.Set(e.name, data, 0) 34 | }(e) 35 | } 36 | 37 | wg.Wait() 38 | close(errors) 39 | 40 | var errmsgs []string 41 | for err := range errors { 42 | errmsgs = append(errmsgs, err.Error()) 43 | return fmt.Errorf("error(s) encountered calling slurm api: [%s]", strings.Join(errmsgs, ", ")) 44 | } 45 | 46 | slog.Debug("finished populating cache") 47 | 48 | return nil 49 | } 50 | 51 | func WipeCache(ctx context.Context) error { 52 | apiCache := ctx.Value(types.ApiCacheKey).(*cache.Cache) 53 | apiCache.Delete("diag") 54 | apiCache.Delete("nodes") 55 | apiCache.Delete("jobs") 56 | apiCache.Delete("partitions") 57 | apiCache.Delete("shares") 58 | return nil 59 | } 60 | -------------------------------------------------------------------------------- /docker/slurm.dockerfile: -------------------------------------------------------------------------------- 1 | FROM nathanhess/slurm:full-root 2 | 3 | # Install systemd 4 | RUN apt-get update && apt-get install -y \ 5 | systemd \ 6 | && apt-get clean \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | # Create necessary directories 10 | RUN mkdir -p /etc/systemd/system/multi-user.target.wants /container/jobs /container/output /container/err 11 | 12 | # Copy Slurm configuration files 13 | COPY slurm.conf /etc/slurm/slurm.conf 14 | COPY cgroup.conf /etc/slurm/cgroup.conf 15 | 16 | # Set arguments for resources 17 | ARG CPU=4 18 | ARG MEMORY=8192 19 | 20 | # Display OS info 21 | RUN echo "Container OS:" && cat /etc/os-release 22 | 23 | # Create sample SLURM job scripts 24 | RUN echo '#!/bin/bash\n\ 25 | #SBATCH --job-name=hello_world\n\ 26 | #SBATCH --output=/container/output/hello_world.out\n\ 27 | #SBATCH --error=/container/err/hello_world.err\n\ 28 | #SBATCH --time=00:05:00\n\ 29 | #SBATCH --ntasks=1\n\n\ 30 | 
echo "Hello World"\n\ 31 | sleep 300' > /container/jobs/hello_world_job.sbatch 32 | 33 | RUN echo '#!/bin/bash\n\ 34 | #SBATCH --job-name=lets_go\n\ 35 | #SBATCH --output=/container/output/lets_go.out\n\ 36 | #SBATCH --error=/container/err/lets_go.err\n\ 37 | #SBATCH --time=00:05:00\n\ 38 | #SBATCH --ntasks=1\n\n\ 39 | echo "Let'\''s Go"\n\ 40 | sleep 300' > /container/jobs/lets_go_job.sbatch 41 | 42 | RUN chmod +x /container/jobs/hello_world_job.sbatch /container/jobs/lets_go_job.sbatch 43 | 44 | COPY start_slurm.sh /usr/local/bin/start_slurm.sh 45 | RUN chmod +x /usr/local/bin/start_slurm.sh 46 | 47 | ENTRYPOINT ["/usr/local/bin/start_slurm.sh"] 48 | CMD ["/bin/systemd"] 49 | 50 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # This is an example .goreleaser.yml file with some sensible defaults. 2 | # Make sure to check the documentation at https://goreleaser.com 3 | 4 | # The lines below are called `modelines`. See `:help modeline` 5 | # Feel free to remove those if you don't want/need to use them. 6 | # yaml-language-server: $schema=https://goreleaser.com/static/schema.json 7 | # vim: set ts=2 sw=2 tw=0 fo=cnqoj 8 | 9 | version: 2 10 | 11 | before: 12 | hooks: 13 | - go mod tidy 14 | 15 | archives: 16 | - format: binary 17 | 18 | changelog: 19 | sort: asc 20 | filters: 21 | exclude: 22 | - "^docs:" 23 | - "^test:" 24 | builds: 25 | - id: 'slurm-23.11' 26 | main: ./cmd/prometheus-slurm-exporter/main.go 27 | binary: prometheus-slurm-exporter_slurm-23.11_{{ .Os }}_{{ .Arch }} 28 | flags: 29 | - -tags=2311 30 | env: 31 | - CGO_ENABLED=0 32 | goos: 33 | - linux 34 | goarch: 35 | - amd64 36 | - arm64 37 | 38 | - id: 'slurm-24.05' 39 | main: ./cmd/prometheus-slurm-exporter/main.go 40 | binary: prometheus-slurm-exporter_slurm-24.05_{{ .Os }}_{{ .Arch }} 41 | flags: 42 | - -tags=2405 43 | env: 44 | - CGO_ENABLED=0 45 | goos: 46 | - linux 47 | goarch: 48 | - amd64 49 | - arm64 50 | 51 | - id: 'slurm-24.11' 52 | main: ./cmd/prometheus-slurm-exporter/main.go 53 | binary: prometheus-slurm-exporter_slurm-24.11_{{ .Os }}_{{ .Arch }} 54 | flags: 55 | - -tags=2411 56 | env: 57 | - CGO_ENABLED=0 58 | goos: 59 | - linux 60 | goarch: 61 | - amd64 62 | - arm64 63 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2311_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("V0040OpenapiDiagResp.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0040OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0040OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func 
TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0040OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0040OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2405_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("SlurmV0041GetDiag200Response.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0041OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0041OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0041OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0041OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /internal/api/unmarshalers_2411_test.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 10 | ) 11 | 12 | func TestUnmarshalDiagResponse(t *testing.T) { 13 | var r DiagResp 14 | fb := util.ReadTestDataBytes("SlurmV0041GetDiag200Response.json") 15 | err := json.Unmarshal(fb, &r) 16 | if err != nil { 17 | t.Fatalf("failed to unmarshal diag response: %v\n", err) 18 | } 19 | } 20 | 21 | func TestUnmarshalJobsResponse(t *testing.T) { 22 | var r JobsResp 23 | fb := util.ReadTestDataBytes("V0041OpenapiJobInfoResp.json") 24 | err := json.Unmarshal(fb, &r) 25 | if err != nil { 26 | t.Fatalf("failed to unmarshal jobs response: %v\n", err) 27 | } 28 | } 29 | 30 | func TestUnmarshalNodesResponse(t *testing.T) { 31 | var r NodesResp 32 | fb := util.ReadTestDataBytes("V0041OpenapiNodesResp.json") 33 | err := json.Unmarshal(fb, &r) 34 | if err != nil { 35 | 
t.Fatalf("failed to unmarshal nodes response: %v\n", err) 36 | } 37 | } 38 | 39 | func TestUnmarshalPartitionsResponse(t *testing.T) { 40 | var r PartitionsResp 41 | fb := util.ReadTestDataBytes("V0041OpenapiPartitionResp.json") 42 | err := json.Unmarshal(fb, &r) 43 | if err != nil { 44 | t.Fatalf("failed to unmarshal partition response: %v\n", err) 45 | } 46 | } 47 | 48 | func TestUnmarshalSharesResponse(t *testing.T) { 49 | var r SharesResp 50 | fb := util.ReadTestDataBytes("V0041OpenapiSharesResp.json") 51 | fb = util.CleanseInfinity(fb) 52 | err := json.Unmarshal(fb, &r) 53 | if err != nil { 54 | t.Fatalf("failed to unmarshal shares response: %v\n", err) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /docker/start_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure logfile and /var/log/munge have the correct ownership 4 | # Start munge daemon as munge user 5 | echo "Starting the munge daemon" 6 | sudo -u munge /usr/sbin/munged 7 | 8 | # Check if munge daemon started successfully 9 | if ps aux | grep -q '[m]unged'; then 10 | echo "Munge daemon started successfully" 11 | else 12 | echo "Failed to start Munge daemon" 13 | exit 1 14 | fi 15 | 16 | # Output the Slurm configuration 17 | #cat /usr/local/etc/slurm.conf 18 | # Start the slurmctld daemon 19 | echo "Starting the slurmctld daemon" 20 | if /slurm/src/slurmctld/slurmctld -f /usr/local/etc/slurm.conf; then 21 | echo "slurmctld daemon started successfully" 22 | else 23 | echo "Failed to start slurmctld daemon" 24 | exit 1 25 | fi 26 | 27 | # Start the slurmd daemon 28 | echo "Starting the slurmd daemon" 29 | if /slurm/src/slurmd/slurmd/slurmd --conf-server localhost:6281; then 30 | echo "slurmd daemon started successfully" 31 | else 32 | echo "Failed to start slurmd daemon" 33 | exit 1 34 | fi 35 | 36 | echo "Starting the slurmdbd daemon" 37 | if /slurm/src/slurmdbd/slurmdbd; then 38 | echo "slurmdbd daemon started successfully" 39 | else 40 | echo "Failed to start slurmd daemon" 41 | exit 1 42 | fi 43 | 44 | sleep 3 45 | ps aux | grep munged | grep -v grep 46 | ps aux | grep slurmd | grep -v grep 47 | ps aux | grep slurmctld | grep -v grep 48 | ps aux | grep slurmdbd | grep -v grep 49 | #echo "Submitting jobs" 50 | #sbatch /jobs/hello_world_job.sbatch 51 | #if [ $? -eq 0 ]; then 52 | # echo "hello_world_job.sbatch submitted successfully" 53 | #else 54 | # echo "Failed to submit hello_world_job.sbatch" 55 | #fi 56 | # 57 | #sbatch /jobs/lets_go_job.sbatch 58 | #if [ $? 
-eq 0 ]; then 59 | # echo "lets_go_job.sbatch submitted successfully" 60 | #else 61 | # echo "Failed to submit lets_go_job.sbatch" 62 | #fi 63 | 64 | # Keep the container running 65 | #tail -f /dev/null 66 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/akyoto/cache v1.0.6 h1:5XGVVYoi2i+DZLLPuVIXtsNIJ/qaAM16XT0LaBaXd2k= 2 | github.com/akyoto/cache v1.0.6/go.mod h1:WfxTRqKhfgAG71Xh6E3WLpjhBtZI37O53G4h5s+3iM4= 3 | github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 4 | github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 5 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 6 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 7 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 8 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 10 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 11 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 12 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 13 | github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= 14 | github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= 15 | github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 16 | github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= 17 | github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= 18 | github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= 19 | github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= 20 | github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 21 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= 22 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 23 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 24 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 25 | -------------------------------------------------------------------------------- /internal/slurm/fairshare.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type FairShareCollector struct { 14 | ctx context.Context 15 | fairshare *prometheus.Desc 16 | } 17 | 18 | func NewFairShareCollector(ctx context.Context) *FairShareCollector { 19 | labels := []string{"account"} 20 | return &FairShareCollector{ 21 | ctx: ctx, 22 | fairshare: prometheus.NewDesc("slurm_account_fairshare", "FairShare for account", labels, nil), 23 | } 24 | } 25 | 26 | func (fsc *FairShareCollector) Describe(ch 
chan<- *prometheus.Desc) { 27 | ch <- fsc.fairshare 28 | } 29 | 30 | func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) { 31 | apiCache := fsc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 32 | sharesRespBytes, found := apiCache.Get("shares") 33 | if !found { 34 | slog.Error("failed to get shares response for fair share metrics from cache") 35 | return 36 | } 37 | 38 | sharesData, err := api.ProcessSharesResponse(sharesRespBytes.([]byte)) 39 | if err != nil { 40 | slog.Error("failed to process shares response for fair share metrics", "error", err) 41 | return 42 | } 43 | fsm, err := ParseFairShareMetrics(sharesData) 44 | if err != nil { 45 | slog.Error("failed to collect fair share metrics", "error", err) 46 | return 47 | } 48 | for f := range fsm { 49 | ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) 50 | } 51 | } 52 | 53 | type fairShareMetrics struct { 54 | fairshare float64 55 | } 56 | 57 | func NewFairShareMetrics() *fairShareMetrics { 58 | return &fairShareMetrics{} 59 | } 60 | 61 | func ParseFairShareMetrics(sharesData *api.SharesData) (map[string]*fairShareMetrics, error) { 62 | accounts := make(map[string]*fairShareMetrics) 63 | for _, s := range sharesData.Shares { 64 | account := s.Name 65 | if account == "root" { 66 | // we don't care about the root account 67 | continue 68 | } 69 | if _, exists := accounts[account]; !exists { 70 | accounts[account] = NewFairShareMetrics() 71 | } 72 | accounts[account].fairshare = s.EffectiveUsage 73 | } 74 | return accounts, nil 75 | } 76 | -------------------------------------------------------------------------------- /internal/api/responses_2311.go: -------------------------------------------------------------------------------- 1 | //go:build 2311 2 | 3 | package api 4 | 5 | var apiVersion = "23.11" 6 | 7 | type DiagResp struct { 8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"allocated_cores"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 
`json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage *float64 `json:"effective_usage"` 70 | } `json:"shares"` 71 | } `json:"shares"` 72 | } 73 | -------------------------------------------------------------------------------- /internal/api/responses_2405.go: -------------------------------------------------------------------------------- 1 | //go:build 2405 2 | 3 | package api 4 | 5 | var apiVersion = "24.05" 6 | 7 | type DiagResp struct { 8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"cpus"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 `json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage struct { 70 | Number *float64 `json:"number"` 71 | } `json:"effective_usage"` 72 | } `json:"shares"` 73 | } `json:"shares"` 74 | } 75 | -------------------------------------------------------------------------------- /internal/api/responses_2411.go: -------------------------------------------------------------------------------- 1 | //go:build 2411 2 | 3 | package api 4 | 5 | var apiVersion = "24.11" 6 | 7 | type DiagResp struct { 
8 | Statistics struct { 9 | ServerThreadCount *int32 `json:"server_thread_count"` 10 | AgentQueueSize *int32 `json:"agent_queue_size"` 11 | DbdAgentQueueSize *int32 `json:"dbd_agent_queue_size"` 12 | ScheduleCycleLast *int32 `json:"schedule_cycle_last"` 13 | ScheduleCycleMean *int64 `json:"schedule_cycle_mean"` 14 | ScheduleCyclePerMinute *int64 `json:"schedule_cycle_per_minute"` 15 | BfDepthMean *int64 `json:"bf_depth_mean"` 16 | BfCycleLast *int32 `json:"bf_cycle_last"` 17 | BfCycleMean *int64 `json:"bf_cycle_mean"` 18 | BfBackfilledJobs *int32 `json:"bf_backfilled_jobs"` 19 | BfLastBackfilledJobs *int32 `json:"bf_last_backfilled_jobs"` 20 | BfBackfilledHetJobs *int32 `json:"bf_backfilled_het_jobs"` 21 | } `json:"statistics"` 22 | } 23 | 24 | type JobsResp struct { 25 | Jobs []struct { 26 | Account *string `json:"account"` 27 | UserName *string `json:"user_name"` 28 | Partition *string `json:"partition"` 29 | JobState []string `json:"job_state"` 30 | Dependency *string `json:"dependency"` 31 | JobResources struct { 32 | Cpus *int32 `json:"cpus"` 33 | } `json:"job_resources"` 34 | } `json:"jobs"` 35 | } 36 | 37 | type NodesResp struct { 38 | Nodes []struct { 39 | Name *string `json:"name,omitempty"` 40 | Hostname *string `json:"hostname,omitempty"` 41 | State []string `json:"state,omitempty"` 42 | Tres *string `json:"tres,omitempty"` 43 | TresUsed *string `json:"tres_used,omitempty"` 44 | Partitions []string `json:"partitions,omitempty"` 45 | AllocMemory *int64 `json:"alloc_memory,omitempty"` 46 | RealMemory *int64 `json:"real_memory,omitempty"` 47 | AllocCpus *int32 `json:"alloc_cpus,omitempty"` 48 | AllocIdleCpus *int32 `json:"alloc_idle_cpus,omitempty"` 49 | Cpus *int32 `json:"cpus,omitempty"` 50 | } `json:"nodes"` 51 | } 52 | 53 | type PartitionsResp struct { 54 | Partitions []struct { 55 | Name *string `json:"name,omitempty"` 56 | Cpus *struct { 57 | Total *int32 `json:"total"` 58 | } `json:"cpus"` 59 | Nodes *struct { 60 | Configured *string `json:"configured"` 61 | } `json:"nodes"` 62 | } `json:"partitions"` 63 | } 64 | 65 | type SharesResp struct { 66 | Shares struct { 67 | Shares []struct { 68 | Name *string `json:"name"` 69 | EffectiveUsage *struct { 70 | Number *float64 `json:"number"` 71 | } `json:"effective_usage"` 72 | } `json:"shares"` 73 | } `json:"shares"` 74 | } 75 | -------------------------------------------------------------------------------- /internal/api/unmarshalers.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log/slog" 7 | 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/util" 9 | ) 10 | 11 | func ProcessDiagResponse(b []byte) (*DiagData, error) { 12 | var r DiagResp 13 | if len(b) == 0 { 14 | return nil, fmt.Errorf("failed to unmarshal diag response, body is empty") 15 | } 16 | err := json.Unmarshal(b, &r) 17 | if err != nil { 18 | slog.Debug("failed to unmarshal diag response", "body", string(b)) 19 | return nil, fmt.Errorf("failed to unmarshall diag response data: %v", err) 20 | } 21 | d := NewDiagData() 22 | d.FromResponse(r) 23 | return d, nil 24 | } 25 | 26 | // ProcessJobsResponse converts the response bytes into a slurm type 27 | func ProcessJobsResponse(b []byte) (*JobsData, error) { 28 | var r JobsResp 29 | if len(b) == 0 { 30 | return nil, fmt.Errorf("failed to unmarshal jobs response, body is empty") 31 | } 32 | err := json.Unmarshal(b, &r) 33 | if err != nil { 34 | slog.Debug("failed to unmarshal jobs response", "body", string(b)) 
35 | return nil, fmt.Errorf("failed to unmarshall jobs response data: %v", err) 36 | } 37 | d := NewJobsData() 38 | d.FromResponse(r) 39 | return d, nil 40 | } 41 | 42 | // ProcessNodesResponse converts the response bytes into a slurm type 43 | func ProcessNodesResponse(b []byte) (*NodesData, error) { 44 | var r NodesResp 45 | if len(b) == 0 { 46 | return nil, fmt.Errorf("failed to unmarshal nodes response, body is empty") 47 | } 48 | err := json.Unmarshal(b, &r) 49 | if err != nil { 50 | slog.Debug("failed to unmarshal nodes response", "body", string(b)) 51 | return nil, fmt.Errorf("failed to unmarshall nodes response data: %v", err) 52 | } 53 | d := NewNodesData() 54 | d.FromResponse(r) 55 | return d, nil 56 | } 57 | 58 | // ProcessPartitionsResponse converts the response bytes into a slurm type 59 | func ProcessPartitionsResponse(b []byte) (*PartitionsData, error) { 60 | var r PartitionsResp 61 | if len(b) == 0 { 62 | return nil, fmt.Errorf("failed to unmarshal partitions response, body is empty") 63 | } 64 | err := json.Unmarshal(b, &r) 65 | if err != nil { 66 | slog.Debug("failed to unmarshal partitions response", "body", string(b)) 67 | return nil, fmt.Errorf("failed to unmarshall partitions response data: %v", err) 68 | } 69 | d := NewPartitionsData() 70 | d.FromResponse(r) 71 | return d, nil 72 | } 73 | 74 | // ProcessSharesResponse converts the response bytes into a slurm type 75 | func ProcessSharesResponse(b []byte) (*SharesData, error) { 76 | b = util.CleanseInfinity(b) 77 | var r SharesResp 78 | if len(b) == 0 { 79 | return nil, fmt.Errorf("failed to unmarshal shares response, body is empty") 80 | } 81 | err := json.Unmarshal(b, &r) 82 | if err != nil { 83 | slog.Debug("failed to unmarshal shares response", "body", string(b)) 84 | return nil, fmt.Errorf("failed to unmarshall shares response data: %v", err) 85 | } 86 | 87 | d := NewSharesData() 88 | d.FromResponse(r) 89 | return d, nil 90 | } 91 | -------------------------------------------------------------------------------- /docker/23.11.dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | RUN dnf update -y && \ 3 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 4 | dnf install -y --enablerepo=devel mariadb-devel python3-PyMySQL hwloc lz4-devel wget bzip2 perl munge-devel munge cmake jansson libjwt-devel libjwt json-c-devel json-c http-parser-devel http-parser libcgroup libcgroup-tools dbus-devel && \ 5 | dnf group install -y "Development Tools" 6 | 7 | RUN dnf install -y sudo 8 | 9 | RUN dnf -y update && \ 10 | dnf install -y systemd && \ 11 | dnf clean all && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | RUN adduser slurm 15 | 16 | # Install http_parser 17 | RUN git clone --depth 1 --single-branch -b v2.9.4 https://github.com/nodejs/http-parser.git http_parser \ 18 | && cd http_parser \ 19 | && make \ 20 | && make install 21 | 22 | RUN dnf install -y systemd 23 | 24 | WORKDIR /slurm 25 | RUN wget https://download.schedmd.com/slurm/slurm-23.11-latest.tar.bz2 && tar -xvjf slurm-23.11-latest.tar.bz2 --strip-components=1 26 | 27 | RUN ./configure \ 28 | --with-cgroup-v2 \ 29 | --with-http-parser=/usr/local/ \ 30 | --enable-slurmrestd \ 31 | && make && make install 32 | 33 | # Create the /var/log/slurm directory and set permissions 34 | RUN mkdir -p /var/log/slurm && \ 35 | chown slurm:slurm /var/log/slurm && \ 36 | chmod 750 /var/log/slurm && \ 37 | touch /var/log/slurm/slurmd.log && \ 38 | touch 
/var/log/slurm/slurmctld.log && \ 39 | chown slurm:slurm /var/log/slurm/slurmctld.log /var/log/slurm/slurmd.log 40 | 41 | RUN getent group munge || groupadd -r munge && \ 42 | getent passwd munge || useradd -r -g munge munge && \ 43 | mkdir -p /var/log/munge && \ 44 | chown munge:munge /var/log/munge && \ 45 | chmod 750 /var/log/munge && \ 46 | /usr/sbin/create-munge-key && \ 47 | chown munge:munge /etc/munge/munge.key && \ 48 | chmod 400 /etc/munge/munge.key 49 | 50 | RUN touch /var/log/munge/munged.log && \ 51 | chown munge:munge /var/log/munge/munged.log 52 | 53 | COPY slurm.conf /usr/local/etc/slurm.conf 54 | 55 | USER root 56 | COPY cgroup.conf /usr/local/etc/cgroup.conf 57 | COPY slurm.conf /usr/local/etc/slurm.conf 58 | COPY start_slurm.sh /start_slurm.sh 59 | RUN chmod 755 /start_slurm.sh 60 | RUN mkdir -p /var/spool/slurm /var/spool/slurmd && \ 61 | chown slurm:slurm /var/spool/slurm /var/spool/slurmd 62 | 63 | RUN mkdir -p /jobs /jobs/output /jobs/err 64 | 65 | # Create sample SLURM job scripts 66 | 67 | COPY hello_world_job.sbatch /jobs/hello_world_job.sbatch 68 | COPY lets_go_job.sbatch /jobs/lets_go_job.sbatch 69 | 70 | RUN chmod +x /jobs/hello_world_job.sbatch /jobs/lets_go_job.sbatch 71 | 72 | EXPOSE 6280 73 | 74 | RUN ln -s slurm/src/slurmd/slurmd/slurmd /bin/slurmd 75 | 76 | #RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.41 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.41.json 77 | #RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.40 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.40.json 78 | ENTRYPOINT ["/start_slurm.sh"] 79 | -------------------------------------------------------------------------------- /docker/build_slurm_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script is just a quick tool to generate a slurmrestd container 4 | # and dump the latest openapi spec to ./openapi-specs 5 | 6 | import subprocess 7 | import sys 8 | 9 | if len(sys.argv) != 2: 10 | print(f"usage: {sys.argv[0]} ") 11 | exit(1) 12 | 13 | slurm_version = sys.argv[1] 14 | 15 | versions = { 16 | "24.05": { 17 | "api_version": "0.0.41", 18 | "container_version": "24.05", 19 | }, 20 | "23.11": { 21 | "api_version": "0.0.40", 22 | "container_version": "24.05", 23 | }, 24 | } 25 | 26 | if slurm_version not in versions: 27 | print( 28 | "supported slurm versions: {}".format(", ".join([v for v in versions.keys()])) 29 | ) 30 | exit(1) 31 | 32 | oapi_version = versions[slurm_version]["api_version"] 33 | container_version = versions[slurm_version]["container_version"] 34 | 35 | 36 | def cleanup_container(container_version: str): 37 | container_delete_command = f"docker rm -f slurm-{container_version}" 38 | s = subprocess.run( 39 | container_delete_command.split(), 40 | stdout=subprocess.PIPE, 41 | stderr=subprocess.PIPE, 42 | universal_newlines=True, 43 | ) 44 | 45 | if s.returncode != 0: 46 | raise Exception(f"Failed to clean up container: {s.stderr}") 47 | 48 | 49 | def build_container(container_version: str): 50 | build_command = f"docker build -t slurm_{container_version} --file {container_version}.dockerfile ." 
51 | s = subprocess.run( 52 | build_command.split(), 53 | stdout=subprocess.PIPE, 54 | stderr=subprocess.PIPE, 55 | universal_newlines=True, 56 | ) 57 | 58 | if s.returncode != 0: 59 | raise Exception(f"Failed to build SLURM: {s.stderr}") 60 | 61 | 62 | def create_container(container_version: str) -> str: 63 | create_command = ( 64 | f"docker create --name slurm-{container_version} slurm_{container_version}" 65 | ) 66 | s = subprocess.run( 67 | create_command.split(), 68 | stdout=subprocess.PIPE, 69 | stderr=subprocess.PIPE, 70 | universal_newlines=True, 71 | ) 72 | 73 | if s.returncode != 0: 74 | raise Exception(f"Failed to create SLURM container: {s.stderr}") 75 | 76 | return s.stdout.strip() 77 | 78 | 79 | def copy_container_file(container_id: str, oapi_version: str): 80 | copy_command = f"docker cp {container_id}:/slurm/v{oapi_version}.json ../openapi-specs/{slurm_version}.json" 81 | s = subprocess.run( 82 | copy_command.split(), 83 | stdout=subprocess.PIPE, 84 | stderr=subprocess.PIPE, 85 | universal_newlines=True, 86 | ) 87 | 88 | if s.returncode != 0: 89 | raise Exception(f"Failed to copy Openapi specs from container: {s.stderr}") 90 | 91 | 92 | print( 93 | f"Building SLURM {container_version} to get Openapi manifest version {oapi_version}" 94 | ) 95 | 96 | try: 97 | build_container(container_version) 98 | container_id = create_container(container_version) 99 | copy_container_file(container_id, oapi_version) 100 | cleanup_container(container_version) 101 | print(f"Copied openapi spec {oapi_version} to ../openapi-specs/{slurm_version}.json") 102 | 103 | except Exception as e: 104 | print(f"Failed to copy openapi spec: {e}") 105 | cleanup_container(container_version) 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | You must have access to a slurm head node running `slurmrestd` and a valid token 4 | for that service. Take note of your slurm version, such as `24.05`, as you'll 5 | use this version when building. 6 | 7 | ## Requirements 8 | 9 | Install Go from source if you don't already have it. 
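The version pinned below matches the `go 1.22.5` directive in `go.mod` and the Go toolchain used in the GitHub Actions workflow.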
10 | 11 | ```bash 12 | export VERSION=1.22.5 OS=linux ARCH=amd64 13 | wget https://dl.google.com/go/go$VERSION.$OS-$ARCH.tar.gz 14 | tar -xzvf go$VERSION.$OS-$ARCH.tar.gz 15 | export PATH=$PWD/go/bin:$PATH 16 | ``` 17 | 18 | _Alternatively install Go using the packaging system of your Linux 19 | distribution._ 20 | 21 | ## Building 22 | 23 | ### Clone this repository and build 24 | 25 | Use Git to clone the source code: 26 | 27 | ```bash 28 | git clone https://github.com/lcrownover/prometheus-slurm-exporter.git 29 | cd prometheus-slurm-exporter 30 | ``` 31 | 32 | Build the binary for your SLURM version, for example 24.05: 33 | 34 | ```bash 35 | SLURM_VERSION=24.05 make 36 | ``` 37 | 38 | Run tests for a specific SLURM version: 39 | 40 | ```bash 41 | SLURM_VERSION=24.05 make test 42 | ``` 43 | 44 | Run the tests for all SLURM versions: 45 | 46 | ```bash 47 | SLURM_VERSION=all make test 48 | ``` 49 | 50 | Start the exporter: 51 | 52 | ```bash 53 | ./bin/prometheus-slurm-exporter 54 | ``` 55 | 56 | If you wish to run the exporter on a different port, or the default port (8080) 57 | is already in use, run with the following argument: 58 | 59 | ```bash 60 | ./bin/prometheus-slurm-exporter --listen-address="0.0.0.0:" 61 | ``` 62 | 63 | Query all metrics: 64 | 65 | ```bash 66 | curl http://localhost:8080/metrics 67 | ``` 68 | 69 | ### Cutting releases 70 | 71 | Once you're ready to cut a new release, perform the following steps on the 72 | `main` branch. 73 | 74 | Tag the release version: 75 | 76 | `git tag v1.0.1` 77 | 78 | Push the tag: 79 | 80 | `git push origin v1.0.1` 81 | 82 | Make sure you have `GITHUB_TOKEN` exported, then use `goreleaser` to create 83 | releases: 84 | 85 | `goreleaser release --clean` 86 | 87 | ## Adding Support for New Openapi Versions 88 | 89 | ### Install openapi-generator-cli and openjdk 90 | 91 | Install `openapi-generator-cli` globally with NPM: 92 | 93 | ```bash 94 | npm install -g @openapitools/openapi-generator-cli` 95 | ``` 96 | 97 | This package depends on having the `java` executable in `PATH`, so install java. 98 | 99 | For mac, `brew install java`, then following the brew message, symlink the JDK, 100 | `sudo ln -sfn /usr/local/opt/openjdk/libexec/openjdk.jdk /Library/Java/JavaVirtualMachines/openjdk.jdk` 101 | 102 | For ubuntu, `sudo snap install openjdk`. 103 | 104 | ### Generating and Saving Openapi specs from SLURM using Docker 105 | 106 | Navigate to the `docker` directory and use the python script to automatically 107 | grab and store an openapi yaml spec from a target slurm version into the 108 | `openapi-specs` directory. 109 | 110 | ```bash 111 | python build_slurm_version.py 24.11 112 | ``` 113 | 114 | ### Generating the Openapi code for new SLURM versions 115 | 116 | I do this for every new SLURM version, so it should already be done. 117 | 118 | Assuming 23.11: 119 | 120 | ```bash 121 | openapi-generator-cli generate \ 122 | -g go \ 123 | -i openapi-specs/23.11.json \ 124 | -o ../openapi-slurm-23-11 \ 125 | --package-name openapi_slurm_23_11 \ 126 | --git-user-id lcrownover \ 127 | --git-repo-id openapi-slurm-23-11 128 | ``` 129 | 130 | This will generate an entire git repository that you can toss up in GitHub. 
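The same invocation covers newer specs by substituting the version. For example, a 24.11 run would presumably look like the following; the output directory, package name, and git-repo-id here simply apply the naming convention above to 24.11 and are assumptions, not an existing repository:

```bash
openapi-generator-cli generate \
    -g go \
    -i openapi-specs/24.11.json \
    -o ../openapi-slurm-24-11 \
    --package-name openapi_slurm_24_11 \
    --git-user-id lcrownover \
    --git-repo-id openapi-slurm-24-11
```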
131 | -------------------------------------------------------------------------------- /internal/slurm/users.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type UsersCollector struct { 14 | ctx context.Context 15 | pending *prometheus.Desc 16 | pending_cpus *prometheus.Desc 17 | running *prometheus.Desc 18 | running_cpus *prometheus.Desc 19 | suspended *prometheus.Desc 20 | } 21 | 22 | func NewUsersCollector(ctx context.Context) *UsersCollector { 23 | labels := []string{"user"} 24 | return &UsersCollector{ 25 | ctx: ctx, 26 | pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), 27 | pending_cpus: prometheus.NewDesc("slurm_user_cpus_pending", "Pending cpus for user", labels, nil), 28 | running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), 29 | running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), 30 | suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), 31 | } 32 | } 33 | 34 | func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) { 35 | ch <- uc.pending 36 | ch <- uc.pending_cpus 37 | ch <- uc.running 38 | ch <- uc.running_cpus 39 | ch <- uc.suspended 40 | } 41 | 42 | func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) { 43 | apiCache := uc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 44 | jobsRespBytes, found := apiCache.Get("jobs") 45 | if !found { 46 | slog.Error("failed to get jobs response for users metrics from cache") 47 | return 48 | } 49 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 50 | if err != nil { 51 | slog.Error("failed to process jobs data for users metrics", "error", err) 52 | return 53 | } 54 | um, err := ParseUsersMetrics(jobsData) 55 | if err != nil { 56 | slog.Error("failed to collect user metrics", "error", err) 57 | return 58 | } 59 | for u := range um { 60 | if um[u].pending > 0 { 61 | ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) 62 | } 63 | if um[u].pending_cpus > 0 { 64 | ch <- prometheus.MustNewConstMetric(uc.pending_cpus, prometheus.GaugeValue, um[u].pending_cpus, u) 65 | } 66 | if um[u].running > 0 { 67 | ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) 68 | } 69 | if um[u].running_cpus > 0 { 70 | ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) 71 | } 72 | if um[u].suspended > 0 { 73 | ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) 74 | } 75 | } 76 | } 77 | 78 | func NewUserJobMetrics() *userJobMetrics { 79 | return &userJobMetrics{0, 0, 0, 0, 0} 80 | } 81 | 82 | type userJobMetrics struct { 83 | pending float64 84 | pending_cpus float64 85 | running float64 86 | running_cpus float64 87 | suspended float64 88 | } 89 | 90 | func ParseUsersMetrics(jobsData *api.JobsData) (map[string]*userJobMetrics, error) { 91 | users := make(map[string]*userJobMetrics) 92 | for _, j := range jobsData.Jobs { 93 | user := j.UserName 94 | if _, exists := users[user]; !exists { 95 | users[user] = NewUserJobMetrics() 96 | } 97 | 98 | switch j.JobState { 99 |
case types.JobStatePending: 100 | users[user].pending++ 101 | users[user].pending_cpus += float64(j.Cpus) 102 | case types.JobStateRunning: 103 | users[user].running++ 104 | users[user].running_cpus += float64(j.Cpus) 105 | case types.JobStateSuspended: 106 | users[user].suspended++ 107 | } 108 | } 109 | return users, nil 110 | } 111 | -------------------------------------------------------------------------------- /internal/slurm/gpus.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type GPUsCollector struct { 14 | ctx context.Context 15 | alloc *prometheus.Desc 16 | idle *prometheus.Desc 17 | other *prometheus.Desc 18 | total *prometheus.Desc 19 | utilization *prometheus.Desc 20 | } 21 | 22 | func NewGPUsCollector(ctx context.Context) *GPUsCollector { 23 | return &GPUsCollector{ 24 | ctx: ctx, 25 | alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), 26 | idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), 27 | other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil), 28 | total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), 29 | utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), 30 | } 31 | } 32 | 33 | func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) { 34 | ch <- cc.alloc 35 | ch <- cc.idle 36 | ch <- cc.other 37 | ch <- cc.total 38 | ch <- cc.utilization 39 | } 40 | func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) { 41 | apiCache := cc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 42 | nodesRespBytes, found := apiCache.Get("nodes") 43 | if !found { 44 | slog.Error("failed to get nodes response for cpu metrics from cache") 45 | return 46 | } 47 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 48 | if err != nil { 49 | slog.Error("failed to process nodes response for gpu metrics", "error", err) 50 | return 51 | } 52 | gm, err := ParseGPUsMetrics(nodesData) 53 | if err != nil { 54 | slog.Error("failed to collect gpus metrics", "error", err) 55 | return 56 | } 57 | ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, gm.alloc) 58 | ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, gm.idle) 59 | ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, gm.other) 60 | ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, gm.total) 61 | ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, gm.utilization) 62 | } 63 | 64 | type gpusMetrics struct { 65 | alloc float64 66 | idle float64 67 | other float64 68 | total float64 69 | utilization float64 70 | } 71 | 72 | func NewGPUsMetrics() *gpusMetrics { 73 | return &gpusMetrics{} 74 | } 75 | 76 | // NOTES: 77 | // node[gres] => gpu:0 # no gpus 78 | // node[gres] => gpu:nvidia_h100_80gb_hbm3:4(S:0-1) # 4 h100 gpus 79 | // node[gres_used] => gpu:nvidia_h100_80gb_hbm3:4(IDX:0-3) # 4 used gpus 80 | // node[gres_used] => gpu:nvidia_h100_80gb_hbm3:0(IDX:N/A) # 0 used gpus 81 | // node[tres] => cpu=48,mem=1020522M,billing=48,gres/gpu=4 # 4 total gpus 82 | // node[tres] => cpu=1,mem=1M,billing=1 # 0 total gpus 83 | // node[tres_used] => cpu=48,mem=1020522M,billing=48,gres/gpu=4 # 4 used 
gpus 84 | // node[tres_used] => cpu=1,mem=1M,billing=1 # 0 used gpus 85 | // 86 | // For tracking gpu resources, it looks like tres will be better. If I need to pull out per-gpu stats later, 87 | // I'll have to use gres 88 | // 89 | 90 | // ParseGPUsMetrics iterates through node response objects and tallies up the total and 91 | // allocated gpus, then derives idle and utilization from those numbers. 92 | func ParseGPUsMetrics(nodesData *api.NodesData) (*gpusMetrics, error) { 93 | gm := NewGPUsMetrics() 94 | for _, n := range nodesData.Nodes { 95 | idleGPUs := n.GPUTotal - n.GPUAllocated 96 | gm.total += float64(n.GPUTotal) 97 | gm.alloc += float64(n.GPUAllocated) 98 | gm.idle += float64(idleGPUs) 99 | } 100 | // TODO: Do we really need an "other" field? 101 | // using TRES, it should be straightforward. 102 | if gm.total > 0 { 103 | // if total is 0, we get NaN, so we check here 104 | gm.other = gm.total - (gm.alloc + gm.idle) 105 | } else { 106 | gm.other = 0 107 | } 108 | return gm, nil 109 | } 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Slurm Exporter 2 | 3 | Prometheus collector and exporter for metrics extracted from the [Slurm](https://slurm.schedmd.com/overview.html) resource scheduling system. 4 | 5 | This project was forked from [https://github.com/vpenso/prometheus-slurm-exporter](https://github.com/vpenso/prometheus-slurm-exporter) and, for now, aims to be backwards-compatible from SLURM 23.11 forward. 6 | This means the existing Grafana Dashboard should plug directly into this exporter and work roughly the same. 7 | 8 | Unlike previous slurm exporters, this project leverages the SLURM REST API (`slurmrestd`) for data retrieval. 9 | Due to that difference, you are no longer required to run this exporter on a cluster node, as the exporter does not depend on having SLURM installed or being connected to the head node! 10 | I will be releasing containerized versions of this exporter soon. 11 | 12 | ## Installation 13 | 14 | This repository contains precompiled binaries for the three most recent major versions of SLURM _(Note: currently only two versions, but will be three when 24.11 releases)_. 15 | On the [releases](https://github.com/lcrownover/prometheus-slurm-exporter/releases) page, download the newest version of the exporter that matches your SLURM version. 16 | The included systemd file assumes you've saved this binary to `/usr/local/sbin/prometheus-slurm-exporter`, so drop it there or take note to change the systemd file if you choose to use it. 17 | 18 | ## Configuration 19 | 20 | The exporter requires several environment variables to be set: 21 | 22 | * `SLURM_EXPORTER_LISTEN_ADDRESS` 23 | 24 | This should be the full address for the exporter to listen on. 25 | 26 | _Default: `0.0.0.0:8080`_ 27 | 28 | * `SLURM_EXPORTER_API_URL` 29 | 30 | This is the URL to your slurmrestd server. 31 | 32 | _Example: `http://head1.domain.edu:6820`_ 33 | _Example: `unix://path/to/unix/socket`_ 34 | 35 | * `SLURM_EXPORTER_API_USER` 36 | 37 | The user specified in the token command. 38 | 39 | * `SLURM_EXPORTER_API_TOKEN` 40 | 41 | This is the [SLURM token to authenticate against slurmrestd](https://slurm.schedmd.com/jwt.html).
42 | 43 | The easiest way to generate this is by running the following line on your head node: 44 | 45 | ```bash 46 | scontrol token username=myuser lifespan=someseconds 47 | ``` 48 | 49 | `myuser` should probably be the `slurm` user, or some other privileged account. 50 | 51 | `lifespan` is specified in seconds. I set mine for 1 year (`lifespan=31536000`). 52 | 53 | * `SLURM_EXPORTER_ENABLE_TLS` 54 | 55 | Set to `true` to enable TLS support. You must also provide paths to your certificate and key. 56 | 57 | * `SLURM_EXPORTER_TLS_CERT_PATH` 58 | 59 | Path to your TLS certificate. 60 | 61 | * `SLURM_EXPORTER_TLS_KEY_PATH` 62 | 63 | Path to your TLS key, it should be `0600`. 64 | 65 | ## Systemd 66 | 67 | A systemd unit file is [included](https://github.com/lcrownover/prometheus-slurm-exporter/blob/develop/extras/systemd/prometheus-slurm-exporter.service) for ease of deployment. 68 | 69 | This unit file assumes you've written your environment variables to `/etc/prometheus-slurm-exporter/env.conf` in the format: 70 | 71 | ``` 72 | SLURM_EXPORTER_API_URL="http://head.domain.edu:6820" 73 | SLURM_EXPORTER_API_USER="root" 74 | SLURM_EXPORTER_API_TOKEN="mytoken" 75 | ``` 76 | 77 | _Don't forget to `chmod 600 /etc/prometheus-slurm-exporter/env.conf`!_ 78 | 79 | ## Prometheus Server Scrape Config 80 | 81 | This is an example scrape config for your prometheus server: 82 | 83 | ``` 84 | scrape_configs: 85 | - job_name: 'slurm_exporter' 86 | scrape_interval: 30s 87 | scrape_timeout: 30s 88 | static_configs: 89 | - targets: ['exporter_host.domain.edu:8080'] 90 | ``` 91 | 92 | ## Grafana Dashboard 93 | 94 | The [dashboard](https://grafana.com/dashboards/4323) published by the previous author should work the same with this exporter. 95 | I will be releasing a new version of the dashboard soon that will receive new features. 96 | 97 | ![Status of the Nodes](images/Node_Status.png) 98 | 99 | ![Status of the Jobs](images/Job_Status.png) 100 | 101 | ![SLURM Scheduler Information](images/Scheduler_Info.png) 102 | 103 | ## Contributing 104 | 105 | Check out the [CONTRIBUTING.md](CONTRIBUTING.md) document. 
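## Troubleshooting

If the exporter starts but `/metrics` returns errors or no SLURM data, the `slurmrestd` connection details are the usual suspect. As a rough sanity check, you can query `slurmrestd` directly with the same values the exporter uses. The `X-SLURM-USER-NAME` and `X-SLURM-USER-TOKEN` headers below are the ones the exporter itself sends; the version segment in the path (for example `v0.0.40` or `v0.0.41`) depends on which API versions your `slurmrestd` build exposes, so adjust it to match:

```bash
# Hit the diag endpoint with the same credentials the exporter will use.
curl -s \
  -H "X-SLURM-USER-NAME: $SLURM_EXPORTER_API_USER" \
  -H "X-SLURM-USER-TOKEN: $SLURM_EXPORTER_API_TOKEN" \
  "$SLURM_EXPORTER_API_URL/slurm/v0.0.41/diag"
```

If that returns JSON rather than an authentication error, the exporter should be able to reach `slurmrestd` with the same settings.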
106 | -------------------------------------------------------------------------------- /internal/slurm/cpus.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | // CPU metrics collector 14 | type CPUsCollector struct { 15 | ctx context.Context 16 | alloc *prometheus.Desc 17 | idle *prometheus.Desc 18 | other *prometheus.Desc 19 | total *prometheus.Desc 20 | } 21 | 22 | // NewCPUsCollector creates a new CPUsCollector 23 | func NewCPUsCollector(ctx context.Context) *CPUsCollector { 24 | return &CPUsCollector{ 25 | ctx: ctx, 26 | alloc: prometheus.NewDesc("slurm_cpus_alloc", "Allocated CPUs", nil, nil), 27 | idle: prometheus.NewDesc("slurm_cpus_idle", "Idle CPUs", nil, nil), 28 | other: prometheus.NewDesc("slurm_cpus_other", "Mix CPUs", nil, nil), 29 | total: prometheus.NewDesc("slurm_cpus_total", "Total CPUs", nil, nil), 30 | } 31 | } 32 | 33 | func (cc *CPUsCollector) Describe(ch chan<- *prometheus.Desc) { 34 | ch <- cc.alloc 35 | ch <- cc.idle 36 | ch <- cc.other 37 | ch <- cc.total 38 | } 39 | 40 | func (cc *CPUsCollector) Collect(ch chan<- prometheus.Metric) { 41 | apiCache := cc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 42 | jobsRespBytes, found := apiCache.Get("jobs") 43 | if !found { 44 | slog.Error("failed to get jobs response for users metrics from cache") 45 | return 46 | } 47 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 48 | if err != nil { 49 | slog.Error("failed to process jobs response for cpu metrics", "error", err) 50 | return 51 | } 52 | nodesRespBytes, found := apiCache.Get("nodes") 53 | if !found { 54 | slog.Error("failed to get nodes response for cpu metrics from cache") 55 | return 56 | } 57 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to process nodes response for cpu metrics", "error", err) 60 | return 61 | } 62 | cm, err := ParseCPUsMetrics(nodesData, jobsData) 63 | if err != nil { 64 | slog.Error("failed to collect cpus metrics", "error", err) 65 | return 66 | } 67 | ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc) 68 | ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle) 69 | ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other) 70 | ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total) 71 | } 72 | 73 | type cpusMetrics struct { 74 | alloc float64 75 | idle float64 76 | other float64 77 | total float64 78 | } 79 | 80 | func NewCPUsMetrics() *cpusMetrics { 81 | return &cpusMetrics{} 82 | } 83 | 84 | // ParseCPUMetrics pulls out total cluster cpu states of alloc,idle,other,total 85 | func ParseCPUsMetrics(nodesData *api.NodesData, jobsData *api.JobsData) (*cpusMetrics, error) { 86 | cm := NewCPUsMetrics() 87 | for _, j := range jobsData.Jobs { 88 | // alloc is easy, we just add up all the cpus in the "Running" job state 89 | if j.JobState == types.JobStateRunning { 90 | cm.alloc += float64(j.Cpus) 91 | } 92 | } 93 | // total is just the total number of cpus in the cluster 94 | nodes := nodesData.Nodes 95 | for _, n := range nodes { 96 | if n.Cpus == 1 { 97 | // TODO: This probably needs to be a call to partitions to get all nodes 98 | // in a 
partition, then add the nodes CPU values up for this field. 99 | // In our environment, nodes that exist (need slurm commands) get 100 | // put into slurm without being assigned a partition, but slurm 101 | // seems to track these systems with cpus=1. 102 | // This isn't a problem unless your site has nodes with a single CPU. 103 | continue 104 | } 105 | cpus := float64(n.Cpus) 106 | cm.total += cpus 107 | 108 | for _, ns := range n.States { 109 | if ns == types.NodeStateMix || ns == types.NodeStateAlloc || ns == types.NodeStateIdle { 110 | // TODO: This calculate is scuffed. In our 17k core environment, it's 111 | // reporting ~400 more than the `sinfo -h -o '%C'` command. 112 | // Gotta figure this one out. 113 | idle_cpus := float64(n.AllocIdleCpus) 114 | cm.idle += idle_cpus 115 | } 116 | } 117 | } 118 | // Assumedly, this should be fine. 119 | cm.other = cm.total - cm.idle - cm.alloc 120 | return cm, nil 121 | } 122 | -------------------------------------------------------------------------------- /internal/slurm/account.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | /* 14 | 15 | AccountsCollector collects metrics for accounts 16 | 17 | */ 18 | 19 | // AccountsCollector collects metrics for accounts 20 | type AccountsCollector struct { 21 | ctx context.Context 22 | pending *prometheus.Desc 23 | pending_cpus *prometheus.Desc 24 | running *prometheus.Desc 25 | running_cpus *prometheus.Desc 26 | suspended *prometheus.Desc 27 | } 28 | 29 | // NewAccountsCollector creates a new AccountsCollector 30 | func NewAccountsCollector(ctx context.Context) *AccountsCollector { 31 | labels := []string{"account"} 32 | return &AccountsCollector{ 33 | ctx: ctx, 34 | pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil), 35 | pending_cpus: prometheus.NewDesc("slurm_account_cpus_pending", "Pending cpus for account", labels, nil), 36 | running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil), 37 | running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil), 38 | suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil), 39 | } 40 | } 41 | 42 | func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) { 43 | ch <- ac.pending 44 | ch <- ac.pending_cpus 45 | ch <- ac.running 46 | ch <- ac.running_cpus 47 | ch <- ac.suspended 48 | } 49 | 50 | func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) { 51 | apiCache := ac.ctx.Value(types.ApiCacheKey).(*cache.Cache) 52 | jobsRespBytes, found := apiCache.Get("jobs") 53 | if !found { 54 | slog.Error("failed to get jobs response for users metrics from cache") 55 | return 56 | } 57 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to extract jobs data for accounts metrics", "error", err) 60 | return 61 | } 62 | am, err := ParseAccountsMetrics(*jobsData) 63 | if err != nil { 64 | slog.Error("failed to parse accounts metrics", "error", err) 65 | return 66 | } 67 | for a := range am { 68 | if am[a].pending > 0 { 69 | ch <- prometheus.MustNewConstMetric(ac.pending, 
prometheus.GaugeValue, am[a].pending, a) 70 | } 71 | if am[a].pending_cpus > 0 { 72 | ch <- prometheus.MustNewConstMetric(ac.pending_cpus, prometheus.GaugeValue, am[a].pending_cpus, a) 73 | } 74 | if am[a].running > 0 { 75 | ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a) 76 | } 77 | if am[a].running_cpus > 0 { 78 | ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a) 79 | } 80 | if am[a].suspended > 0 { 81 | ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a) 82 | } 83 | } 84 | } 85 | 86 | type JobMetrics struct { 87 | pending float64 88 | pending_cpus float64 89 | running float64 90 | running_cpus float64 91 | suspended float64 92 | } 93 | 94 | func NewJobMetrics() *JobMetrics { 95 | return &JobMetrics{} 96 | } 97 | 98 | // ParseAccountsMetrics gets the response body of jobs from SLURM and 99 | // parses it into a map of "accountName": *JobMetrics 100 | func ParseAccountsMetrics(jobsData api.JobsData) (map[string]*JobMetrics, error) { 101 | accounts := make(map[string]*JobMetrics) 102 | for _, j := range jobsData.Jobs { 103 | // build the map with the account name as the key and job metrics as the value 104 | _, key := accounts[j.Account] 105 | if !key { 106 | // initialize a new metrics object if the key isnt found 107 | accounts[j.Account] = NewJobMetrics() 108 | } 109 | // for each of the jobs, depending on the state, 110 | // tally up the cpu count and increment the count of jobs for that state 111 | switch j.JobState { 112 | case types.JobStatePending: 113 | accounts[j.Account].pending++ 114 | accounts[j.Account].pending_cpus += float64(j.Cpus) 115 | case types.JobStateRunning: 116 | accounts[j.Account].running++ 117 | accounts[j.Account].running_cpus += float64(j.Cpus) 118 | case types.JobStateSuspended: 119 | accounts[j.Account].suspended++ 120 | } 121 | } 122 | return accounts, nil 123 | } 124 | -------------------------------------------------------------------------------- /internal/slurm/node.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | 8 | "github.com/akyoto/cache" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 10 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 11 | "github.com/prometheus/client_golang/prometheus" 12 | ) 13 | 14 | type NodeCollector struct { 15 | ctx context.Context 16 | cpuAlloc *prometheus.Desc 17 | cpuIdle *prometheus.Desc 18 | cpuOther *prometheus.Desc 19 | cpuTotal *prometheus.Desc 20 | memAlloc *prometheus.Desc 21 | memTotal *prometheus.Desc 22 | } 23 | 24 | // NewNodeCollectorOld creates a Prometheus collector to keep all our stats in 25 | // It returns a set of collections for consumption 26 | func NewNodeCollector(ctx context.Context) *NodeCollector { 27 | labels := []string{"node", "status"} 28 | 29 | return &NodeCollector{ 30 | ctx: ctx, 31 | cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), 32 | cpuIdle: prometheus.NewDesc("slurm_node_cpu_idle", "Idle CPUs per node", labels, nil), 33 | cpuOther: prometheus.NewDesc("slurm_node_cpu_other", "Other CPUs per node", labels, nil), 34 | cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil), 35 | memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil), 36 | memTotal: prometheus.NewDesc("slurm_node_mem_total", 
"Total memory per node", labels, nil), 37 | } 38 | } 39 | 40 | // Send all metric descriptions 41 | func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { 42 | ch <- nc.cpuAlloc 43 | ch <- nc.cpuIdle 44 | ch <- nc.cpuOther 45 | ch <- nc.cpuTotal 46 | ch <- nc.memAlloc 47 | ch <- nc.memTotal 48 | } 49 | 50 | func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { 51 | apiCache := nc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 52 | nodesRespBytes, found := apiCache.Get("nodes") 53 | if !found { 54 | slog.Error("failed to get nodes response for cpu metrics from cache") 55 | return 56 | } 57 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 58 | if err != nil { 59 | slog.Error("failed to process nodes response for node metrics", "error", err) 60 | return 61 | } 62 | nm, err := ParseNodeMetrics(nodesData) 63 | if err != nil { 64 | slog.Error("failed to collect nodes metrics", "error", err) 65 | return 66 | } 67 | for node := range nm { 68 | ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nm[node].cpuAlloc), node, nm[node].nodeStatus) 69 | ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nm[node].cpuIdle), node, nm[node].nodeStatus) 70 | ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nm[node].cpuOther), node, nm[node].nodeStatus) 71 | ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nm[node].cpuTotal), node, nm[node].nodeStatus) 72 | ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nm[node].memAlloc), node, nm[node].nodeStatus) 73 | ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nm[node].memTotal), node, nm[node].nodeStatus) 74 | } 75 | } 76 | 77 | // NodeMetrics stores metrics for each node 78 | type nodeMetrics struct { 79 | memAlloc uint64 80 | memTotal uint64 81 | cpuAlloc uint64 82 | cpuIdle uint64 83 | cpuOther uint64 84 | cpuTotal uint64 85 | nodeStatus string 86 | } 87 | 88 | func NewNodeMetrics() *nodeMetrics { 89 | return &nodeMetrics{} 90 | } 91 | 92 | // ParseNodeMetrics takes the output of sinfo with node data 93 | // It returns a map of metrics per node 94 | func ParseNodeMetrics(nodesData *api.NodesData) (map[string]*nodeMetrics, error) { 95 | nodeMap := make(map[string]*nodeMetrics) 96 | 97 | for _, n := range nodesData.Nodes { 98 | nodeName := n.Hostname 99 | nodeMap[nodeName] = &nodeMetrics{0, 0, 0, 0, 0, 0, ""} 100 | 101 | // state 102 | nodeStatesStr, err := n.GetNodeStatesString("|") 103 | if err != nil { 104 | return nil, fmt.Errorf("failed to get node state: %v", err) 105 | } 106 | nodeMap[nodeName].nodeStatus = nodeStatesStr 107 | 108 | // memory 109 | nodeMap[nodeName].memAlloc = uint64(n.AllocMemory) 110 | nodeMap[nodeName].memTotal = uint64(n.RealMemory) 111 | 112 | // cpu 113 | nodeMap[nodeName].cpuAlloc = uint64(n.AllocCpus) 114 | nodeMap[nodeName].cpuIdle = uint64(n.AllocIdleCpus) 115 | nodeMap[nodeName].cpuOther = uint64(n.OtherCpus) 116 | nodeMap[nodeName].cpuTotal = uint64(n.Cpus) 117 | } 118 | 119 | return nodeMap, nil 120 | } 121 | -------------------------------------------------------------------------------- /internal/slurm/nodes.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | 
"github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type NodesCollector struct { 14 | ctx context.Context 15 | alloc *prometheus.Desc 16 | comp *prometheus.Desc 17 | down *prometheus.Desc 18 | drain *prometheus.Desc 19 | err *prometheus.Desc 20 | fail *prometheus.Desc 21 | idle *prometheus.Desc 22 | maint *prometheus.Desc 23 | mix *prometheus.Desc 24 | resv *prometheus.Desc 25 | } 26 | 27 | func NewNodesCollector(ctx context.Context) *NodesCollector { 28 | return &NodesCollector{ 29 | ctx: ctx, 30 | alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", nil, nil), 31 | comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", nil, nil), 32 | down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", nil, nil), 33 | drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", nil, nil), 34 | err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", nil, nil), 35 | fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", nil, nil), 36 | idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", nil, nil), 37 | maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", nil, nil), 38 | mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", nil, nil), 39 | resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", nil, nil), 40 | } 41 | } 42 | 43 | func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) { 44 | ch <- nc.alloc 45 | ch <- nc.comp 46 | ch <- nc.down 47 | ch <- nc.drain 48 | ch <- nc.err 49 | ch <- nc.fail 50 | ch <- nc.idle 51 | ch <- nc.maint 52 | ch <- nc.mix 53 | ch <- nc.resv 54 | } 55 | 56 | func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) { 57 | apiCache := nc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 58 | nodesRespBytes, found := apiCache.Get("nodes") 59 | if !found { 60 | slog.Error("failed to get nodes response for cpu metrics from cache") 61 | return 62 | } 63 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 64 | if err != nil { 65 | slog.Error("failed to process nodes response for nodes metrics", "error", err) 66 | return 67 | } 68 | nm, err := ParseNodesMetrics(nodesData) 69 | if err != nil { 70 | slog.Error("failed to collect nodes metrics", "error", err) 71 | return 72 | } 73 | ch <- prometheus.MustNewConstMetric(nc.alloc, prometheus.GaugeValue, nm.alloc) 74 | ch <- prometheus.MustNewConstMetric(nc.comp, prometheus.GaugeValue, nm.comp) 75 | ch <- prometheus.MustNewConstMetric(nc.down, prometheus.GaugeValue, nm.down) 76 | ch <- prometheus.MustNewConstMetric(nc.drain, prometheus.GaugeValue, nm.drain) 77 | ch <- prometheus.MustNewConstMetric(nc.err, prometheus.GaugeValue, nm.err) 78 | ch <- prometheus.MustNewConstMetric(nc.fail, prometheus.GaugeValue, nm.fail) 79 | ch <- prometheus.MustNewConstMetric(nc.idle, prometheus.GaugeValue, nm.idle) 80 | ch <- prometheus.MustNewConstMetric(nc.maint, prometheus.GaugeValue, nm.maint) 81 | ch <- prometheus.MustNewConstMetric(nc.mix, prometheus.GaugeValue, nm.mix) 82 | ch <- prometheus.MustNewConstMetric(nc.resv, prometheus.GaugeValue, nm.resv) 83 | } 84 | 85 | type nodesMetrics struct { 86 | alloc float64 87 | comp float64 88 | down float64 89 | drain float64 90 | err float64 91 | fail float64 92 | idle float64 93 | maint float64 94 | mix float64 95 | resv float64 96 | } 97 | 98 | func NewNodesMetrics() *nodesMetrics { 99 | return &nodesMetrics{} 100 | } 101 | 102 | // ParseNodesMetrics iterates through node response objects and tallies up 103 | // nodes based on their state 104 
| func ParseNodesMetrics(nodesData *api.NodesData) (*nodesMetrics, error) { 105 | nm := NewNodesMetrics() 106 | 107 | for _, n := range nodesData.Nodes { 108 | for _, ns := range n.States { 109 | switch ns { 110 | case types.NodeStateAlloc: 111 | nm.alloc += 1 112 | case types.NodeStateComp: 113 | nm.comp += 1 114 | case types.NodeStateDown: 115 | nm.down += 1 116 | case types.NodeStateDrain: 117 | nm.drain += 1 118 | case types.NodeStateErr: 119 | nm.err += 1 120 | case types.NodeStateFail: 121 | nm.fail += 1 122 | case types.NodeStateIdle: 123 | nm.idle += 1 124 | case types.NodeStateMaint: 125 | nm.maint += 1 126 | case types.NodeStateMix: 127 | nm.mix += 1 128 | case types.NodeStateResv: 129 | nm.resv += 1 130 | } 131 | } 132 | } 133 | 134 | return nm, nil 135 | } 136 | -------------------------------------------------------------------------------- /cmd/prometheus-slurm-exporter/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "log/slog" 8 | "net/http" 9 | "os" 10 | "strconv" 11 | "strings" 12 | "time" 13 | 14 | "github.com/akyoto/cache" 15 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 16 | "github.com/lcrownover/prometheus-slurm-exporter/internal/slurm" 17 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 18 | "github.com/prometheus/client_golang/prometheus" 19 | ) 20 | 21 | var err error 22 | 23 | var version = "2.1.1-beta" 24 | 25 | func main() { 26 | // set up logging 27 | lvl := slog.LevelInfo 28 | _, debug := os.LookupEnv("SLURM_EXPORTER_DEBUG") 29 | if debug { 30 | lvl = slog.LevelDebug 31 | } 32 | l := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ 33 | Level: lvl, 34 | })) 35 | slog.SetDefault(l) 36 | slog.Debug("debug logging enabled") 37 | 38 | // if -v is passed, print the version and exit 39 | if len(os.Args) > 1 && os.Args[1] == "-v" { 40 | fmt.Println(version) 41 | os.Exit(0) 42 | } 43 | 44 | log.Printf("Starting Prometheus Slurm Exporter %s\n", version) 45 | 46 | listenAddress, found := os.LookupEnv("SLURM_EXPORTER_LISTEN_ADDRESS") 47 | if !found { 48 | listenAddress = "0.0.0.0:8080" 49 | } 50 | 51 | apiURL, found := os.LookupEnv("SLURM_EXPORTER_API_URL") 52 | if !found { 53 | fmt.Println("You must set SLURM_EXPORTER_API_URL. Example: localhost:6820") 54 | os.Exit(1) 55 | } 56 | 57 | var apiUser string 58 | var apiToken string 59 | var tlsEnable bool 60 | var tlsCert string 61 | var tlsKey string 62 | 63 | // we only need these values if the endpoint is not unix:// 64 | if strings.HasPrefix(apiURL, "http://") || strings.HasPrefix(apiURL, "https://") { 65 | var found bool 66 | apiUser, found = os.LookupEnv("SLURM_EXPORTER_API_USER") 67 | if !found { 68 | fmt.Println("You must set SLURM_EXPORTER_API_USER") 69 | os.Exit(1) 70 | } 71 | 72 | apiToken, found = os.LookupEnv("SLURM_EXPORTER_API_TOKEN") 73 | if !found { 74 | fmt.Println("You must set SLURM_EXPORTER_API_TOKEN") 75 | os.Exit(1) 76 | } 77 | 78 | tlsString, found := os.LookupEnv("SLURM_EXPORTER_ENABLE_TLS") 79 | 80 | if !found { 81 | tlsEnable = false // default to false, do not break existing conf files 82 | } else { 83 | tlsEnable, err = strconv.ParseBool(tlsString) 84 | if err != nil { 85 | fmt.Println("Failed to parse SLURM_EXPORTER_ENABLE_TLS. 
Please set to 1, t, T, TRUE, true, True, 0, f, F, FALSE, false, or False.") 86 | } 87 | } 88 | if tlsEnable { // require tlsCert and tlsKey only if tlsEnable is true 89 | tlsCert, found = os.LookupEnv("SLURM_EXPORTER_TLS_CERT_PATH") 90 | if !found { 91 | fmt.Println("You must set SLURM_EXPORTER_TLS_CERT_PATH to the path of your cert") 92 | os.Exit(1) 93 | } 94 | tlsKey, found = os.LookupEnv("SLURM_EXPORTER_TLS_KEY_PATH") 95 | if !found { 96 | fmt.Println("You must set SLURM_EXPORTER_TLS_KEY_PATH to the path of your key") 97 | os.Exit(1) 98 | } 99 | } 100 | 101 | } else if strings.HasPrefix(apiURL, "unix://") { 102 | apiUser = "" 103 | apiToken = "" 104 | tlsEnable = false 105 | tlsCert = "" 106 | tlsKey = "" 107 | 108 | } else { 109 | fmt.Println("SLURM_EXPORTER_API_URL must start with unix://, http://, or https://") 110 | fmt.Println("Got: ", apiURL) 111 | os.Exit(1) 112 | } 113 | // API Cache 114 | apiCache := cache.New(60 * time.Second) 115 | 116 | // Set up the context to pass around 117 | ctx := context.Background() 118 | ctx = context.WithValue(ctx, types.ApiUserKey, apiUser) 119 | ctx = context.WithValue(ctx, types.ApiTokenKey, apiToken) 120 | ctx = context.WithValue(ctx, types.ApiURLKey, apiURL) 121 | ctx = context.WithValue(ctx, types.ApiCacheKey, apiCache) 122 | 123 | // Register all the endpoints 124 | ctx = api.RegisterEndpoints(ctx) 125 | 126 | // Register all the collectors 127 | r := prometheus.NewRegistry() 128 | r.MustRegister(slurm.NewAccountsCollector(ctx)) 129 | r.MustRegister(slurm.NewCPUsCollector(ctx)) 130 | r.MustRegister(slurm.NewGPUsCollector(ctx)) 131 | r.MustRegister(slurm.NewNodesCollector(ctx)) 132 | r.MustRegister(slurm.NewNodeCollector(ctx)) 133 | r.MustRegister(slurm.NewPartitionsCollector(ctx)) 134 | r.MustRegister(slurm.NewFairShareCollector(ctx)) 135 | r.MustRegister(slurm.NewQueueCollector(ctx)) 136 | r.MustRegister(slurm.NewSchedulerCollector(ctx)) 137 | r.MustRegister(slurm.NewUsersCollector(ctx)) 138 | 139 | log.Printf("Starting Server: %s\n", listenAddress) 140 | http.Handle("/metrics", api.MetricsHandler(r, ctx)) 141 | if tlsEnable { 142 | log.Fatal(http.ListenAndServeTLS(listenAddress, tlsCert, tlsKey, nil)) 143 | } else { 144 | log.Fatal(http.ListenAndServe(listenAddress, nil)) 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /testdata/SlurmV0041GetShares200Response.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : 2.3021358869347655, 18 | "shares" : { 19 | "number" : 1, 20 | "set" : true, 21 | "infinite" : true 22 | }, 23 | "partition" : "partition", 24 | "usage_normalized" : { 25 | "number" : 7.061401241503109, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "name" : "name", 30 | "tres" : { 31 | "run_seconds" : [ { 32 | "name" : "name", 33 | "value" : { 34 | "number" : 5, 35 | "set" : true, 36 | "infinite" : true 37 | } 38 | }, { 39 | "name" : "name", 40 | "value" : { 41 | "number" : 5, 42 | "set" : true, 43 | "infinite" : true 44 | } 45 | } ], 46 | "usage" : [ { 47 | "name" : "name", 48 | "value" : 5.637376656633329 49 | }, { 50 | "name" : "name", 51 | "value" : 
5.637376656633329 52 | } ], 53 | "group_minutes" : [ { 54 | "name" : "name", 55 | "value" : { 56 | "number" : 5, 57 | "set" : true, 58 | "infinite" : true 59 | } 60 | }, { 61 | "name" : "name", 62 | "value" : { 63 | "number" : 5, 64 | "set" : true, 65 | "infinite" : true 66 | } 67 | } ] 68 | }, 69 | "id" : 0 70 | }, { 71 | "cluster" : "cluster", 72 | "parent" : "parent", 73 | "shares_normalized" : { 74 | "number" : 6.027456183070403, 75 | "set" : true, 76 | "infinite" : true 77 | }, 78 | "usage" : 9, 79 | "fairshare" : { 80 | "level" : 2.027123023002322, 81 | "factor" : 3.616076749251911 82 | }, 83 | "type" : [ "USER", "USER" ], 84 | "effective_usage" : 2.3021358869347655, 85 | "shares" : { 86 | "number" : 1, 87 | "set" : true, 88 | "infinite" : true 89 | }, 90 | "partition" : "partition", 91 | "usage_normalized" : { 92 | "number" : 7.061401241503109, 93 | "set" : true, 94 | "infinite" : true 95 | }, 96 | "name" : "name", 97 | "tres" : { 98 | "run_seconds" : [ { 99 | "name" : "name", 100 | "value" : { 101 | "number" : 5, 102 | "set" : true, 103 | "infinite" : true 104 | } 105 | }, { 106 | "name" : "name", 107 | "value" : { 108 | "number" : 5, 109 | "set" : true, 110 | "infinite" : true 111 | } 112 | } ], 113 | "usage" : [ { 114 | "name" : "name", 115 | "value" : 5.637376656633329 116 | }, { 117 | "name" : "name", 118 | "value" : 5.637376656633329 119 | } ], 120 | "group_minutes" : [ { 121 | "name" : "name", 122 | "value" : { 123 | "number" : 5, 124 | "set" : true, 125 | "infinite" : true 126 | } 127 | }, { 128 | "name" : "name", 129 | "value" : { 130 | "number" : 5, 131 | "set" : true, 132 | "infinite" : true 133 | } 134 | } ] 135 | }, 136 | "id" : 0 137 | } ], 138 | "total_shares" : 4 139 | }, 140 | "meta" : { 141 | "slurm" : { 142 | "cluster" : "cluster", 143 | "release" : "release", 144 | "version" : { 145 | "major" : "major", 146 | "minor" : "minor", 147 | "micro" : "micro" 148 | } 149 | }, 150 | "plugin" : { 151 | "accounting_storage" : "accounting_storage", 152 | "name" : "name", 153 | "type" : "type", 154 | "data_parser" : "data_parser" 155 | }, 156 | "client" : { 157 | "source" : "source", 158 | "user" : "user", 159 | "group" : "group" 160 | }, 161 | "command" : [ "command", "command" ] 162 | }, 163 | "warnings" : [ { 164 | "description" : "description", 165 | "source" : "source" 166 | }, { 167 | "description" : "description", 168 | "source" : "source" 169 | } ], 170 | "errors" : [ { 171 | "description" : "description", 172 | "source" : "source", 173 | "error" : "error", 174 | "error_number" : 5 175 | }, { 176 | "description" : "description", 177 | "source" : "source", 178 | "error" : "error", 179 | "error_number" : 5 180 | } ] 181 | } 182 | -------------------------------------------------------------------------------- /docker/24.05.dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | RUN dnf clean all && \ 3 | dnf update -y && \ 4 | dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 5 | dnf install -y --enablerepo=devel mariadb-devel python3-PyMySQL hwloc lz4-devel wget bzip2 perl munge-devel munge cmake jansson libjwt-devel libjwt json-c-devel json-c http-parser-devel http-parser libcgroup libcgroup-tools dbus-devel mariadb && \ 6 | dnf group install -y "Development Tools" 7 | 8 | RUN dnf install -y sudo 9 | 10 | RUN dnf -y update && \ 11 | dnf install -y systemd && \ 12 | dnf clean all && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | 16 | # Add fake users to run jobs as 17 
| RUN adduser user1 18 | RUN adduser user2 19 | RUN adduser slurm 20 | 21 | # Install http_parser 22 | RUN git clone --depth 1 --single-branch -b v2.9.4 https://github.com/nodejs/http-parser.git http_parser \ 23 | && cd http_parser \ 24 | && make \ 25 | && make install 26 | 27 | #RUN dnf install -y systemd 28 | #slurmrestd -d list 29 | #Need to do this 30 | 31 | RUN dnf install -y jansson-devel 32 | 33 | RUN git clone --depth 1 --single-branch -b v1.12.0 https://github.com/benmcollins/libjwt.git libjwt \ 34 | && cd libjwt \ 35 | && autoreconf --force --install \ 36 | && ./configure --prefix=/usr/local/ \ 37 | && make -j && make install 38 | 39 | 40 | WORKDIR /slurm 41 | RUN wget https://download.schedmd.com/slurm/slurm-24.05-latest.tar.bz2 && tar -xvjf slurm-24.05-latest.tar.bz2 --strip-components=1 42 | # add --with-jwt=/usr/local/ 43 | RUN ./configure \ 44 | --with-cgroup-v2 \ 45 | --with-http-parser=/usr/local/ \ 46 | --enable-slurmrestd \ 47 | --with-jwt=/usr/local/ \ 48 | && make && make install 49 | 50 | # Create the /var/log/slurm directory and set permissions 51 | RUN mkdir -p /var/log/slurm && \ 52 | chown slurm:slurm /var/log/slurm && \ 53 | chmod 750 /var/log/slurm && \ 54 | touch /var/log/slurm/slurmd.log /var/log/slurm/slurmctld.log /var/log/slurm/slurmdbd.log && \ 55 | chown slurm:slurm /var/log/slurm/slurmctld.log /var/log/slurm/slurmd.log /var/log/slurm/slurmdbd.log 56 | 57 | RUN getent group munge || groupadd -r munge && \ 58 | getent passwd munge || useradd -r -g munge munge && \ 59 | mkdir -p /var/log/munge && \ 60 | chown munge:munge /var/log/munge && \ 61 | chmod 750 /var/log/munge && \ 62 | /usr/sbin/create-munge-key && \ 63 | chown munge:munge /etc/munge/munge.key && \ 64 | chmod 400 /etc/munge/munge.key 65 | 66 | RUN touch /var/log/munge/munged.log && \ 67 | chown munge:munge /var/log/munge/munged.log 68 | 69 | COPY slurm.conf /usr/local/etc/slurm.conf 70 | 71 | USER root 72 | COPY cgroup.conf /usr/local/etc/cgroup.conf 73 | COPY slurm.conf /usr/local/etc/slurm.conf 74 | COPY slurmdbd.conf /usr/local/etc/slurmdbd.conf 75 | RUN chown slurm:slurm /usr/local/etc/slurmdbd.conf 76 | RUN chmod 600 /usr/local/etc/slurmdbd.conf 77 | COPY start_slurm.sh /start_slurm.sh 78 | COPY start_jobs.sh /start_jobs.sh 79 | 80 | ENV SLURM_CONF=/usr/local/etc/slurm.conf 81 | RUN chmod 755 /start_slurm.sh /start_jobs.sh 82 | 83 | RUN mkdir -p /var/spool/slurm /var/spool/slurmd && \ 84 | chown slurm:slurm /var/spool/slurm /var/spool/slurmd && \ 85 | chmod 755 /var/spool/slurmd 86 | 87 | RUN chown -R slurm:slurm /slurm/src/ 88 | 89 | # touch /var/spool/slurmd/cred_state && \ 90 | # chown slurm:slurm /var/spool/slurmd/cred_state && \ 91 | # chmod 755 /var/spool/slurmd/cred_state 92 | 93 | RUN mkdir -p /var/spool/slurm/statesave && dd if=/dev/random of=/var/spool/slurm/statesave/jwt_hs256.key bs=32 count=1 \ 94 | && chown slurm:slurm /var/spool/slurm/statesave/jwt_hs256.key \ 95 | && chmod 0600 /var/spool/slurm/statesave/jwt_hs256.key \ 96 | && chown slurm:slurm /var/spool/slurm/statesave \ 97 | && chmod 0755 /var/spool/slurm/statesave 98 | 99 | 100 | RUN mkdir -p /jobs /jobs/output /jobs/err && \ 101 | chown root:slurm /jobs /jobs/output /jobs/err 102 | 103 | # Create sample SLURM job scripts 104 | 105 | COPY hello_world_job.sbatch /jobs/hello_world_job.sbatch 106 | COPY lets_go_job.sbatch /jobs/lets_go_job.sbatch 107 | 108 | RUN chmod +x /jobs/hello_world_job.sbatch /jobs/lets_go_job.sbatch 109 | 110 | # Ask Lucas about what other ports need to be exposed or if I need to build slurm with 
this port exposed from the getgo 111 | EXPOSE 6280 112 | 113 | RUN ln -s /slurm/src/slurmd/slurmd/slurmd /bin/slurmd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 114 | RUN ln -s /slurm/src/slurmdbd/slurmdbd /bin/slurmdbd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 115 | RUN ln -s /slurm/src/slurmrestd/slurmrestd /bin/slurmrestd # I only added this to make it easier to run the slurmd executable during daemon start troubleshooting 116 | 117 | RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.41 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.41.json 118 | RUN env SLURM_CONF=/dev/null slurmrestd -d v0.0.40 -s slurmdbd,slurmctld --generate-openapi-spec > /slurm/v0.0.40.json 119 | 120 | ENTRYPOINT ["/start_slurm.sh"] 121 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiSharesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : { 18 | "number" : 2.3021358869347655, 19 | "set" : true, 20 | "infinite" : true 21 | }, 22 | "shares" : { 23 | "number" : 1, 24 | "set" : true, 25 | "infinite" : true 26 | }, 27 | "partition" : "partition", 28 | "usage_normalized" : { 29 | "number" : 7.061401241503109, 30 | "set" : true, 31 | "infinite" : true 32 | }, 33 | "name" : "name", 34 | "tres" : { 35 | "run_seconds" : [ { 36 | "name" : "name", 37 | "value" : { 38 | "number" : 5, 39 | "set" : true, 40 | "infinite" : true 41 | } 42 | }, { 43 | "name" : "name", 44 | "value" : { 45 | "number" : 5, 46 | "set" : true, 47 | "infinite" : true 48 | } 49 | } ], 50 | "usage" : [ { 51 | "name" : "name", 52 | "value" : 5.637376656633329 53 | }, { 54 | "name" : "name", 55 | "value" : 5.637376656633329 56 | } ], 57 | "group_minutes" : [ { 58 | "name" : "name", 59 | "value" : { 60 | "number" : 5, 61 | "set" : true, 62 | "infinite" : true 63 | } 64 | }, { 65 | "name" : "name", 66 | "value" : { 67 | "number" : 5, 68 | "set" : true, 69 | "infinite" : true 70 | } 71 | } ] 72 | }, 73 | "id" : 0 74 | }, { 75 | "cluster" : "cluster", 76 | "parent" : "parent", 77 | "shares_normalized" : { 78 | "number" : 6.027456183070403, 79 | "set" : true, 80 | "infinite" : true 81 | }, 82 | "usage" : 9, 83 | "fairshare" : { 84 | "level" : 2.027123023002322, 85 | "factor" : 3.616076749251911 86 | }, 87 | "type" : [ "USER", "USER" ], 88 | "effective_usage" : { 89 | "number" : 2.3021358869347655, 90 | "set" : true, 91 | "infinite" : true 92 | }, 93 | "shares" : { 94 | "number" : 1, 95 | "set" : true, 96 | "infinite" : true 97 | }, 98 | "partition" : "partition", 99 | "usage_normalized" : { 100 | "number" : 7.061401241503109, 101 | "set" : true, 102 | "infinite" : true 103 | }, 104 | "name" : "name", 105 | "tres" : { 106 | "run_seconds" : [ { 107 | "name" : "name", 108 | "value" : { 109 | "number" : 5, 110 | "set" : true, 111 | "infinite" : true 112 | } 113 | }, { 114 | "name" : "name", 115 | "value" : { 116 | "number" : 5, 117 | "set" : true, 118 | "infinite" : true 119 | } 120 | } ], 121 | "usage" : [ { 122 | "name" : "name", 123 | "value" : 5.637376656633329 124 | 
}, { 125 | "name" : "name", 126 | "value" : 5.637376656633329 127 | } ], 128 | "group_minutes" : [ { 129 | "name" : "name", 130 | "value" : { 131 | "number" : 5, 132 | "set" : true, 133 | "infinite" : true 134 | } 135 | }, { 136 | "name" : "name", 137 | "value" : { 138 | "number" : 5, 139 | "set" : true, 140 | "infinite" : true 141 | } 142 | } ] 143 | }, 144 | "id" : 0 145 | } ], 146 | "total_shares" : 4 147 | }, 148 | "meta" : { 149 | "slurm" : { 150 | "cluster" : "cluster", 151 | "release" : "release", 152 | "version" : { 153 | "major" : "major", 154 | "minor" : "minor", 155 | "micro" : "micro" 156 | } 157 | }, 158 | "plugin" : { 159 | "accounting_storage" : "accounting_storage", 160 | "name" : "name", 161 | "type" : "type", 162 | "data_parser" : "data_parser" 163 | }, 164 | "client" : { 165 | "source" : "source", 166 | "user" : "user", 167 | "group" : "group" 168 | }, 169 | "command" : [ "command", "command" ] 170 | }, 171 | "warnings" : [ { 172 | "description" : "description", 173 | "source" : "source" 174 | }, { 175 | "description" : "description", 176 | "source" : "source" 177 | } ], 178 | "errors" : [ { 179 | "description" : "description", 180 | "source" : "source", 181 | "error" : "error", 182 | "error_number" : 5 183 | }, { 184 | "description" : "description", 185 | "source" : "source", 186 | "error" : "error", 187 | "error_number" : 5 188 | } ] 189 | } 190 | -------------------------------------------------------------------------------- /testdata/SlurmV0041GetDiag200Response.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta" : { 3 | "slurm" : { 4 | "cluster" : "cluster", 5 | "release" : "release", 6 | "version" : { 7 | "major" : "major", 8 | "minor" : "minor", 9 | "micro" : "micro" 10 | } 11 | }, 12 | "plugin" : { 13 | "accounting_storage" : "accounting_storage", 14 | "name" : "name", 15 | "type" : "type", 16 | "data_parser" : "data_parser" 17 | }, 18 | "client" : { 19 | "source" : "source", 20 | "user" : "user", 21 | "group" : "group" 22 | }, 23 | "command" : [ "command", "command" ] 24 | }, 25 | "warnings" : [ { 26 | "description" : "description", 27 | "source" : "source" 28 | }, { 29 | "description" : "description", 30 | "source" : "source" 31 | } ], 32 | "errors" : [ { 33 | "description" : "description", 34 | "source" : "source", 35 | "error" : "error", 36 | "error_number" : 5 37 | }, { 38 | "description" : "description", 39 | "source" : "source", 40 | "error" : "error", 41 | "error_number" : 5 42 | } ], 43 | "statistics" : { 44 | "bf_cycle_max" : 4, 45 | "rpcs_by_message_type" : [ { 46 | "cycle_last" : 6, 47 | "average_time" : { 48 | "number" : 3, 49 | "set" : true, 50 | "infinite" : true 51 | }, 52 | "type_id" : 0, 53 | "queued" : 5, 54 | "count" : 7, 55 | "dropped" : 4, 56 | "message_type" : "message_type", 57 | "total_time" : 4, 58 | "cycle_max" : 8 59 | }, { 60 | "cycle_last" : 6, 61 | "average_time" : { 62 | "number" : 3, 63 | "set" : true, 64 | "infinite" : true 65 | }, 66 | "type_id" : 0, 67 | "queued" : 5, 68 | "count" : 7, 69 | "dropped" : 4, 70 | "message_type" : "message_type", 71 | "total_time" : 4, 72 | "cycle_max" : 8 73 | } ], 74 | "bf_backfilled_het_jobs" : 3, 75 | "bf_table_size" : 7, 76 | "schedule_cycle_depth" : 7, 77 | "bf_depth_sum" : 0, 78 | "job_states_ts" : { 79 | "number" : 6, 80 | "set" : true, 81 | "infinite" : true 82 | }, 83 | "bf_queue_len" : 4, 84 | "jobs_started" : 6, 85 | "schedule_cycle_max" : 2, 86 | "server_thread_count" : 5, 87 | "bf_queue_len_sum" : 4, 88 | "bf_cycle_last" : 0, 89 | 
"bf_exit" : { 90 | "state_changed" : 5, 91 | "bf_max_time" : 3, 92 | "bf_max_job_start" : 7, 93 | "bf_node_space_size" : 7, 94 | "end_job_queue" : 8, 95 | "bf_max_job_test" : 3 96 | }, 97 | "agent_thread_count" : 7, 98 | "jobs_completed" : 3, 99 | "bf_depth_mean" : 0, 100 | "bf_depth_try_sum" : 6, 101 | "schedule_cycle_mean" : 1, 102 | "bf_table_size_sum" : 9, 103 | "agent_queue_size" : 5, 104 | "jobs_failed" : 1, 105 | "bf_last_depth_try" : 4, 106 | "req_time" : { 107 | "number" : 6, 108 | "set" : true, 109 | "infinite" : true 110 | }, 111 | "bf_cycle_counter" : 3, 112 | "schedule_queue_length" : 8, 113 | "bf_queue_len_mean" : 1, 114 | "schedule_exit" : { 115 | "max_sched_time" : 9, 116 | "licenses" : 6, 117 | "default_queue_depth" : 4, 118 | "max_job_start" : 5, 119 | "max_rpc_cnt" : 9, 120 | "end_job_queue" : 1 121 | }, 122 | "jobs_canceled" : 6, 123 | "schedule_cycle_sum" : 7, 124 | "jobs_submitted" : 9, 125 | "schedule_cycle_mean_depth" : 1, 126 | "schedule_cycle_per_minute" : 6, 127 | "req_time_start" : { 128 | "number" : 1, 129 | "set" : true, 130 | "infinite" : true 131 | }, 132 | "jobs_running" : 6, 133 | "bf_last_backfilled_jobs" : 6, 134 | "bf_last_depth" : 3, 135 | "bf_backfilled_jobs" : 5, 136 | "rpcs_by_user" : [ { 137 | "average_time" : { 138 | "number" : 3, 139 | "set" : true, 140 | "infinite" : true 141 | }, 142 | "user_id" : 0, 143 | "count" : 2, 144 | "total_time" : 1, 145 | "user" : "user" 146 | }, { 147 | "average_time" : { 148 | "number" : 3, 149 | "set" : true, 150 | "infinite" : true 151 | }, 152 | "user_id" : 0, 153 | "count" : 2, 154 | "total_time" : 1, 155 | "user" : "user" 156 | } ], 157 | "bf_cycle_mean" : 7, 158 | "pending_rpcs_by_hostlist" : [ { 159 | "type_id" : 4, 160 | "count" : [ "count", "count" ], 161 | "message_type" : "message_type" 162 | }, { 163 | "type_id" : 4, 164 | "count" : [ "count", "count" ], 165 | "message_type" : "message_type" 166 | } ], 167 | "dbd_agent_queue_size" : 9, 168 | "bf_table_size_mean" : 0, 169 | "jobs_pending" : 2, 170 | "agent_count" : 2, 171 | "bf_cycle_sum" : 6, 172 | "parts_packed" : 0, 173 | "bf_active" : true, 174 | "bf_depth_mean_try" : 7, 175 | "gettimeofday_latency" : 3, 176 | "pending_rpcs" : [ { 177 | "type_id" : 8, 178 | "count" : 6, 179 | "message_type" : "message_type" 180 | }, { 181 | "type_id" : 8, 182 | "count" : 6, 183 | "message_type" : "message_type" 184 | } ], 185 | "schedule_cycle_total" : 1, 186 | "bf_when_last_cycle" : { 187 | "number" : 9, 188 | "set" : true, 189 | "infinite" : true 190 | }, 191 | "schedule_cycle_last" : 4 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /internal/slurm/queue.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type QueueCollector struct { 14 | ctx context.Context 15 | pending *prometheus.Desc 16 | pending_dep *prometheus.Desc 17 | running *prometheus.Desc 18 | suspended *prometheus.Desc 19 | cancelled *prometheus.Desc 20 | completing *prometheus.Desc 21 | completed *prometheus.Desc 22 | configuring *prometheus.Desc 23 | failed *prometheus.Desc 24 | timeout *prometheus.Desc 25 | preempted *prometheus.Desc 26 | node_fail *prometheus.Desc 27 | } 28 | 29 | func NewQueueCollector(ctx 
context.Context) *QueueCollector { 30 | return &QueueCollector{ 31 | ctx: ctx, 32 | pending: prometheus.NewDesc("slurm_queue_pending", "Pending jobs in queue", nil, nil), 33 | pending_dep: prometheus.NewDesc("slurm_queue_pending_dependency", "Pending jobs because of dependency in queue", nil, nil), 34 | running: prometheus.NewDesc("slurm_queue_running", "Running jobs in the cluster", nil, nil), 35 | suspended: prometheus.NewDesc("slurm_queue_suspended", "Suspended jobs in the cluster", nil, nil), 36 | cancelled: prometheus.NewDesc("slurm_queue_cancelled", "Cancelled jobs in the cluster", nil, nil), 37 | completing: prometheus.NewDesc("slurm_queue_completing", "Completing jobs in the cluster", nil, nil), 38 | completed: prometheus.NewDesc("slurm_queue_completed", "Completed jobs in the cluster", nil, nil), 39 | configuring: prometheus.NewDesc("slurm_queue_configuring", "Configuring jobs in the cluster", nil, nil), 40 | failed: prometheus.NewDesc("slurm_queue_failed", "Number of failed jobs", nil, nil), 41 | timeout: prometheus.NewDesc("slurm_queue_timeout", "Jobs stopped by timeout", nil, nil), 42 | preempted: prometheus.NewDesc("slurm_queue_preempted", "Number of preempted jobs", nil, nil), 43 | node_fail: prometheus.NewDesc("slurm_queue_node_fail", "Number of jobs stopped due to node fail", nil, nil), 44 | } 45 | } 46 | 47 | func (qc *QueueCollector) Describe(ch chan<- *prometheus.Desc) { 48 | ch <- qc.pending 49 | ch <- qc.pending_dep 50 | ch <- qc.running 51 | ch <- qc.suspended 52 | ch <- qc.cancelled 53 | ch <- qc.completing 54 | ch <- qc.completed 55 | ch <- qc.configuring 56 | ch <- qc.failed 57 | ch <- qc.timeout 58 | ch <- qc.preempted 59 | ch <- qc.node_fail 60 | } 61 | 62 | func (qc *QueueCollector) Collect(ch chan<- prometheus.Metric) { 63 | apiCache := qc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 64 | jobsRespBytes, found := apiCache.Get("jobs") 65 | if !found { 66 | slog.Error("failed to get jobs response for users metrics from cache") 67 | return 68 | } 69 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 70 | if err != nil { 71 | slog.Error("failed to process jobs data for queue metrics", "error", err) 72 | return 73 | } 74 | qm, err := ParseQueueMetrics(jobsData) 75 | if err != nil { 76 | slog.Error("failed to collect queue metrics", "error", err) 77 | return 78 | } 79 | ch <- prometheus.MustNewConstMetric(qc.pending, prometheus.GaugeValue, qm.pending) 80 | ch <- prometheus.MustNewConstMetric(qc.pending_dep, prometheus.GaugeValue, qm.pending_dep) 81 | ch <- prometheus.MustNewConstMetric(qc.running, prometheus.GaugeValue, qm.running) 82 | ch <- prometheus.MustNewConstMetric(qc.suspended, prometheus.GaugeValue, qm.suspended) 83 | ch <- prometheus.MustNewConstMetric(qc.cancelled, prometheus.GaugeValue, qm.cancelled) 84 | ch <- prometheus.MustNewConstMetric(qc.completing, prometheus.GaugeValue, qm.completing) 85 | ch <- prometheus.MustNewConstMetric(qc.completed, prometheus.GaugeValue, qm.completed) 86 | ch <- prometheus.MustNewConstMetric(qc.configuring, prometheus.GaugeValue, qm.configuring) 87 | ch <- prometheus.MustNewConstMetric(qc.failed, prometheus.GaugeValue, qm.failed) 88 | ch <- prometheus.MustNewConstMetric(qc.timeout, prometheus.GaugeValue, qm.timeout) 89 | ch <- prometheus.MustNewConstMetric(qc.preempted, prometheus.GaugeValue, qm.preempted) 90 | ch <- prometheus.MustNewConstMetric(qc.node_fail, prometheus.GaugeValue, qm.node_fail) 91 | } 92 | 93 | func NewQueueMetrics() *queueMetrics { 94 | return &queueMetrics{} 95 | } 96 | 97 | type 
queueMetrics struct { 98 | pending float64 99 | pending_dep float64 100 | running float64 101 | suspended float64 102 | cancelled float64 103 | completing float64 104 | completed float64 105 | configuring float64 106 | failed float64 107 | timeout float64 108 | preempted float64 109 | node_fail float64 110 | } 111 | 112 | func ParseQueueMetrics(jobsData *api.JobsData) (*queueMetrics, error) { 113 | qm := NewQueueMetrics() 114 | for _, j := range jobsData.Jobs { 115 | switch j.JobState { 116 | case types.JobStatePending: 117 | if j.Dependency != "" { 118 | qm.pending_dep++ 119 | } else { 120 | qm.pending++ 121 | } 122 | case types.JobStateRunning: 123 | qm.running++ 124 | case types.JobStateSuspended: 125 | qm.suspended++ 126 | case types.JobStateCancelled: 127 | qm.cancelled++ 128 | case types.JobStateCompleting: 129 | qm.completing++ 130 | case types.JobStateCompleted: 131 | qm.completed++ 132 | case types.JobStateConfiguring: 133 | qm.configuring++ 134 | case types.JobStateFailed: 135 | qm.failed++ 136 | case types.JobStateTimeout: 137 | qm.timeout++ 138 | case types.JobStatePreempted: 139 | qm.preempted++ 140 | case types.JobStateNodeFail: 141 | qm.node_fail++ 142 | } 143 | } 144 | return qm, nil 145 | } 146 | -------------------------------------------------------------------------------- /internal/api/transport.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "io" 8 | "log/slog" 9 | "net" 10 | "net/http" 11 | "strings" 12 | 13 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 14 | ) 15 | 16 | type slurmRestRequest struct { 17 | req *http.Request 18 | client *http.Client 19 | } 20 | 21 | type SlurmRestResponse struct { 22 | StatusCode int 23 | Body []byte 24 | } 25 | 26 | // GetSlurmRestResponse retrieves response data from slurm api 27 | func GetSlurmRestResponse(ctx context.Context, endpointCtxKey types.Key) ([]byte, error) { 28 | var endpointStr string 29 | switch endpointCtxKey { 30 | case types.ApiDiagEndpointKey: 31 | endpointStr = "diag" 32 | case types.ApiJobsEndpointKey: 33 | endpointStr = "jobs" 34 | case types.ApiNodesEndpointKey: 35 | endpointStr = "nodes" 36 | case types.ApiPartitionsEndpointKey: 37 | endpointStr = "partitions" 38 | case types.ApiSharesEndpointKey: 39 | endpointStr = "shares" 40 | default: 41 | return nil, fmt.Errorf("invalid endpoint key") 42 | } 43 | slog.Debug("performing rest request", "endpoint", endpointStr) 44 | nr, err := newSlurmRestRequest(ctx, endpointCtxKey) 45 | if err != nil { 46 | return nil, fmt.Errorf("failed to generate new slurm rest request: %v", err) 47 | } 48 | resp, err := nr.Send() 49 | if err != nil { 50 | return nil, fmt.Errorf("failed to retrieve slurm rest response: %v", err) 51 | } 52 | // sometimes slurm fails to get stuff. we want to error here 53 | if resp.StatusCode == 500 { 54 | slog.Debug("incorrect response status code", "endpoint", endpointStr, "code", resp.StatusCode, "body", string(resp.Body)) 55 | 56 | // try to unmarshal the api error and give a better log 57 | var aed APIErrorData 58 | var errStr string 59 | err := json.Unmarshal(resp.Body, &aed) 60 | if err != nil { 61 | errStr = "tried to get more data about the error but failed. 
try debug mode for more information" 62 | } 63 | if err == nil { errStr = aed.ToString() } 64 | return nil, fmt.Errorf("internal server error (500) from slurm controller getting %s data: %s", endpointStr, errStr) 65 | } 66 | // unauthorized responses should say that 67 | if resp.StatusCode == 401 { 68 | return nil, fmt.Errorf("unauthorized: invalid credentials") 69 | } 70 | // otherwise, it should be status 200, so this catches unsupported status codes 71 | if resp.StatusCode != 200 { 72 | slog.Debug("incorrect response status code", "endpoint", endpointStr, "code", resp.StatusCode, "body", string(resp.Body)) 73 | return nil, fmt.Errorf("received incorrect status code for %s data", endpointStr) 74 | } 75 | slog.Debug("successfully queried slurm rest data", "endpoint", endpointStr) 76 | return resp.Body, nil 77 | } 78 | 79 | // newSlurmRestRequest returns a new slurmRestRequest object which is used to perform 80 | // http interactions with the slurmrest server. It configures everything up until 81 | // the request is actually sent to get data. 82 | func newSlurmRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 83 | apiURL := ctx.Value(types.ApiURLKey).(string) 84 | 85 | if strings.HasPrefix(apiURL, "unix://") { 86 | return newSlurmUnixRestRequest(ctx, k) 87 | } else if strings.HasPrefix(apiURL, "http://") || strings.HasPrefix(apiURL, "https://") { 88 | return newSlurmInetRestRequest(ctx, k) 89 | } 90 | return nil, fmt.Errorf("invalid SLURM_EXPORTER_API_URL: %s", apiURL) 91 | } 92 | 93 | func newSlurmInetRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 94 | apiUser := ctx.Value(types.ApiUserKey).(string) 95 | apiToken := ctx.Value(types.ApiTokenKey).(string) 96 | apiURL := ctx.Value(types.ApiURLKey).(string) 97 | apiEndpoint := ctx.Value(k).(string) 98 | 99 | url := fmt.Sprintf("%s/%s", apiURL, apiEndpoint) 100 | req, err := http.NewRequest("GET", url, nil) 101 | if err != nil { 102 | return nil, err 103 | } 104 | req.Header.Set("Accept", "application/json") 105 | req.Header.Set("X-SLURM-USER-NAME", apiUser) 106 | req.Header.Set("X-SLURM-USER-TOKEN", apiToken) 107 | 108 | return &slurmRestRequest{ 109 | req: req, 110 | client: &http.Client{}, 111 | }, nil 112 | } 113 | 114 | func newSlurmUnixRestRequest(ctx context.Context, k types.Key) (*slurmRestRequest, error) { 115 | apiURL := ctx.Value(types.ApiURLKey).(string) 116 | apiEndpoint := ctx.Value(k).(string) 117 | 118 | socketPath := strings.TrimPrefix(apiURL, "unix:") 119 | url := fmt.Sprintf("http://unix/%s", apiEndpoint) 120 | req, err := http.NewRequest("GET", url, nil) 121 | if err != nil { 122 | return nil, err 123 | } 124 | req.Header.Set("Accept", "application/json") 125 | 126 | return &slurmRestRequest{ 127 | req: req, 128 | client: &http.Client{ 129 | Transport: &http.Transport{ 130 | DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { 131 | return net.Dial("unix", socketPath) 132 | }, 133 | DisableKeepAlives: true, 134 | }, 135 | }, 136 | }, nil 137 | } 138 | 139 | // slurmRestRequest.Send is used to perform the request against the slurmrest 140 | // server. It returns a *SlurmRestResponse which is a struct containing the 141 | // response status code and the bytes of the response body. 
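//
// A minimal usage sketch (an illustrative aside, not upstream behavior beyond what
// GetSlurmRestResponse already does above; it assumes ctx was populated at startup
// with the API URL, user, token, and endpoint values keyed by internal/types):
//
//	sr, err := newSlurmRestRequest(ctx, types.ApiDiagEndpointKey)
//	if err != nil {
//		return nil, err
//	}
//	resp, err := sr.Send()
//	if err != nil {
//		return nil, err
//	}
//	// resp.StatusCode and resp.Body hold the raw slurmrestd reply.
//	slog.Debug("slurmrestd reply", "code", resp.StatusCode, "bytes", len(resp.Body))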
142 | func (sr slurmRestRequest) Send() (*SlurmRestResponse, error) { 143 | resp, err := sr.client.Do(sr.req) 144 | if err != nil { 145 | return nil, fmt.Errorf("failed to send request: %v", err) 146 | } 147 | defer resp.Body.Close() 148 | 149 | body, err := io.ReadAll(resp.Body) 150 | if err != nil { 151 | return nil, fmt.Errorf("failed to read response body: %v", err) 152 | } 153 | 154 | sresp := SlurmRestResponse{} 155 | sresp.StatusCode = resp.StatusCode 156 | sresp.Body = body 157 | 158 | return &sresp, nil 159 | } 160 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiNodesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes" : [ { 3 | "reason" : "reason", 4 | "gpu_spec" : "gpu_spec", 5 | "slurmd_start_time" : { 6 | "number" : 3, 7 | "set" : true, 8 | "infinite" : true 9 | }, 10 | "features" : [ "features", "features" ], 11 | "hostname" : "hostname", 12 | "cores" : 1, 13 | "reason_changed_at" : { 14 | "number" : 9, 15 | "set" : true, 16 | "infinite" : true 17 | }, 18 | "reservation" : "reservation", 19 | "tres" : "tres", 20 | "cpu_binding" : 5, 21 | "state" : [ "INVALID", "INVALID" ], 22 | "sockets" : 6, 23 | "energy" : { 24 | "current_watts" : { 25 | "number" : 1, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "base_consumed_energy" : 4, 30 | "last_collected" : 1, 31 | "consumed_energy" : 7, 32 | "previous_consumed_energy" : 1, 33 | "average_watts" : 2 34 | }, 35 | "partitions" : [ "partitions", "partitions" ], 36 | "gres_drained" : "gres_drained", 37 | "weight" : 6, 38 | "version" : "version", 39 | "gres_used" : "gres_used", 40 | "mcs_label" : "mcs_label", 41 | "real_memory" : 4, 42 | "instance_id" : "instance_id", 43 | "burstbuffer_network_address" : "burstbuffer_network_address", 44 | "port" : 1, 45 | "name" : "name", 46 | "resume_after" : { 47 | "number" : 9, 48 | "set" : true, 49 | "infinite" : true 50 | }, 51 | "temporary_disk" : 2, 52 | "tres_used" : "tres_used", 53 | "effective_cpus" : 3, 54 | "instance_type" : "instance_type", 55 | "external_sensors" : {}, 56 | "res_cores_per_gpu" : 5, 57 | "boards" : 0, 58 | "alloc_cpus" : 8, 59 | "active_features" : [ "active_features", "active_features" ], 60 | "reason_set_by_user" : "reason_set_by_user", 61 | "free_mem" : { 62 | "number" : 7, 63 | "set" : true, 64 | "infinite" : true 65 | }, 66 | "alloc_idle_cpus" : 9, 67 | "extra" : "extra", 68 | "operating_system" : "operating_system", 69 | "power" : {}, 70 | "architecture" : "architecture", 71 | "owner" : "owner", 72 | "cluster_name" : "cluster_name", 73 | "address" : "address", 74 | "cpus" : 9, 75 | "tres_weighted" : 6.438423552598547, 76 | "gres" : "gres", 77 | "threads" : 1, 78 | "boot_time" : { 79 | "number" : 6, 80 | "set" : true, 81 | "infinite" : true 82 | }, 83 | "alloc_memory" : 6, 84 | "specialized_memory" : 7, 85 | "specialized_cpus" : "specialized_cpus", 86 | "specialized_cores" : 5, 87 | "last_busy" : { 88 | "number" : 6, 89 | "set" : true, 90 | "infinite" : true 91 | }, 92 | "comment" : "comment", 93 | "next_state_after_reboot" : [ "INVALID", "INVALID" ], 94 | "cpu_load" : 2 95 | }, { 96 | "reason" : "reason", 97 | "gpu_spec" : "gpu_spec", 98 | "slurmd_start_time" : { 99 | "number" : 3, 100 | "set" : true, 101 | "infinite" : true 102 | }, 103 | "features" : [ "features", "features" ], 104 | "hostname" : "hostname", 105 | "cores" : 1, 106 | "reason_changed_at" : { 107 | "number" : 9, 108 | "set" : true, 109 | "infinite" : true 110 | }, 111 | "reservation" 
: "reservation", 112 | "tres" : "tres", 113 | "cpu_binding" : 5, 114 | "state" : [ "INVALID", "INVALID" ], 115 | "sockets" : 6, 116 | "energy" : { 117 | "current_watts" : { 118 | "number" : 1, 119 | "set" : true, 120 | "infinite" : true 121 | }, 122 | "base_consumed_energy" : 4, 123 | "last_collected" : 1, 124 | "consumed_energy" : 7, 125 | "previous_consumed_energy" : 1, 126 | "average_watts" : 2 127 | }, 128 | "partitions" : [ "partitions", "partitions" ], 129 | "gres_drained" : "gres_drained", 130 | "weight" : 6, 131 | "version" : "version", 132 | "gres_used" : "gres_used", 133 | "mcs_label" : "mcs_label", 134 | "real_memory" : 4, 135 | "instance_id" : "instance_id", 136 | "burstbuffer_network_address" : "burstbuffer_network_address", 137 | "port" : 1, 138 | "name" : "name", 139 | "resume_after" : { 140 | "number" : 9, 141 | "set" : true, 142 | "infinite" : true 143 | }, 144 | "temporary_disk" : 2, 145 | "tres_used" : "tres_used", 146 | "effective_cpus" : 3, 147 | "instance_type" : "instance_type", 148 | "external_sensors" : {}, 149 | "res_cores_per_gpu" : 5, 150 | "boards" : 0, 151 | "alloc_cpus" : 8, 152 | "active_features" : [ "active_features", "active_features" ], 153 | "reason_set_by_user" : "reason_set_by_user", 154 | "free_mem" : { 155 | "number" : 7, 156 | "set" : true, 157 | "infinite" : true 158 | }, 159 | "alloc_idle_cpus" : 9, 160 | "extra" : "extra", 161 | "operating_system" : "operating_system", 162 | "power" : {}, 163 | "architecture" : "architecture", 164 | "owner" : "owner", 165 | "cluster_name" : "cluster_name", 166 | "address" : "address", 167 | "cpus" : 9, 168 | "tres_weighted" : 6.438423552598547, 169 | "gres" : "gres", 170 | "threads" : 1, 171 | "boot_time" : { 172 | "number" : 6, 173 | "set" : true, 174 | "infinite" : true 175 | }, 176 | "alloc_memory" : 6, 177 | "specialized_memory" : 7, 178 | "specialized_cpus" : "specialized_cpus", 179 | "specialized_cores" : 5, 180 | "last_busy" : { 181 | "number" : 6, 182 | "set" : true, 183 | "infinite" : true 184 | }, 185 | "comment" : "comment", 186 | "next_state_after_reboot" : [ "INVALID", "INVALID" ], 187 | "cpu_load" : 2 188 | } ], 189 | "meta" : { 190 | "slurm" : { 191 | "cluster" : "cluster", 192 | "release" : "release", 193 | "version" : { 194 | "major" : "major", 195 | "minor" : "minor", 196 | "micro" : "micro" 197 | } 198 | }, 199 | "plugin" : { 200 | "accounting_storage" : "accounting_storage", 201 | "name" : "name", 202 | "type" : "type", 203 | "data_parser" : "data_parser" 204 | }, 205 | "client" : { 206 | "source" : "source", 207 | "user" : "user", 208 | "group" : "group" 209 | }, 210 | "command" : [ "command", "command" ] 211 | }, 212 | "last_update" : { 213 | "number" : 6, 214 | "set" : true, 215 | "infinite" : true 216 | }, 217 | "warnings" : [ { 218 | "description" : "description", 219 | "source" : "source" 220 | }, { 221 | "description" : "description", 222 | "source" : "source" 223 | } ], 224 | "errors" : [ { 225 | "description" : "description", 226 | "source" : "source", 227 | "error" : "error", 228 | "error_number" : 5 229 | }, { 230 | "description" : "description", 231 | "source" : "source", 232 | "error" : "error", 233 | "error_number" : 5 234 | } ] 235 | } 236 | -------------------------------------------------------------------------------- /internal/slurm/partitions.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "strings" 7 | 8 | "github.com/akyoto/cache" 9 | 
"github.com/lcrownover/prometheus-slurm-exporter/internal/api" 10 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 11 | "github.com/prometheus/client_golang/prometheus" 12 | ) 13 | 14 | type PartitionsCollector struct { 15 | ctx context.Context 16 | allocated *prometheus.Desc 17 | idle *prometheus.Desc 18 | other *prometheus.Desc 19 | pending *prometheus.Desc 20 | total *prometheus.Desc 21 | } 22 | 23 | func NewPartitionsCollector(ctx context.Context) *PartitionsCollector { 24 | labels := []string{"partition"} 25 | return &PartitionsCollector{ 26 | ctx: ctx, 27 | allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels, nil), 28 | idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels, nil), 29 | other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels, nil), 30 | pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels, nil), 31 | total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels, nil), 32 | } 33 | } 34 | 35 | func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) { 36 | ch <- pc.allocated 37 | ch <- pc.idle 38 | ch <- pc.other 39 | ch <- pc.pending 40 | ch <- pc.total 41 | } 42 | 43 | func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) { 44 | apiCache := pc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 45 | partitionsRespBytes, found := apiCache.Get("partitions") 46 | if !found { 47 | slog.Error("failed to get partitions response for partitions metrics from cache") 48 | return 49 | } 50 | jobsRespBytes, found := apiCache.Get("jobs") 51 | if !found { 52 | slog.Error("failed to get jobs response for users metrics from cache") 53 | return 54 | } 55 | nodesRespBytes, found := apiCache.Get("nodes") 56 | if !found { 57 | slog.Error("failed to get nodes response for cpu metrics from cache") 58 | return 59 | } 60 | partitionsData, err := api.ProcessPartitionsResponse(partitionsRespBytes.([]byte)) 61 | if err != nil { 62 | slog.Error("failed to process partitions data for partitions metrics", "error", err) 63 | return 64 | } 65 | jobsData, err := api.ProcessJobsResponse(jobsRespBytes.([]byte)) 66 | if err != nil { 67 | slog.Error("failed to process jobs data for partitions metrics", "error", err) 68 | return 69 | } 70 | nodesData, err := api.ProcessNodesResponse(nodesRespBytes.([]byte)) 71 | if err != nil { 72 | slog.Error("failed to process nodes data for partitions metrics", "error", err) 73 | return 74 | } 75 | pm, err := ParsePartitionsMetrics(partitionsData, jobsData, nodesData) 76 | if err != nil { 77 | slog.Error("failed to collect partitions metrics", "error", err) 78 | return 79 | } 80 | for p := range pm { 81 | if pm[p].cpus_allocated > 0 { 82 | ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].cpus_allocated, p) 83 | } 84 | if pm[p].cpus_idle > 0 { 85 | ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].cpus_idle, p) 86 | } 87 | if pm[p].cpus_other > 0 { 88 | ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].cpus_other, p) 89 | } 90 | if pm[p].cpus_total > 0 { 91 | ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].cpus_total, p) 92 | } 93 | if pm[p].jobs_pending > 0 { 94 | ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].jobs_pending, p) 95 | } 96 | } 97 | } 98 | 99 | func NewPartitionsMetrics() *partitionMetrics { 100 | 
return &partitionMetrics{0, 0, 0, 0, 0} 101 | } 102 | 103 | type partitionMetrics struct { 104 | cpus_allocated float64 105 | cpus_idle float64 106 | cpus_other float64 107 | cpus_total float64 108 | jobs_pending float64 109 | } 110 | 111 | // ParsePartitionsMetrics returns a map where the keys are the partition names and the values are a partitionMetrics struct 112 | func ParsePartitionsMetrics(partitionsData *api.PartitionsData, jobsData *api.JobsData, nodesData *api.NodesData) (map[string]*partitionMetrics, error) { 113 | partitions := make(map[string]*partitionMetrics) 114 | nodePartitions := make(map[string][]string) 115 | 116 | // first, scan through partition data to easily get total cpus 117 | for _, p := range partitionsData.Partitions { 118 | _, exists := partitions[p.Name] 119 | if !exists { 120 | partitions[p.Name] = NewPartitionsMetrics() 121 | } 122 | 123 | // cpu total 124 | partitions[p.Name].cpus_total = float64(p.Cpus) 125 | } 126 | 127 | // we need to gather cpus from the nodes perspective because a node can 128 | // be a member of multiple partitions, running a job in one partition, and 129 | // we want to see that there are allocated cpus on the other partition because 130 | // of the shared node. 131 | for _, n := range nodesData.Nodes { 132 | nodePartitions[n.Name] = n.Partitions 133 | } 134 | 135 | // to get used and available cpus, we need to scan through the job list and categorize 136 | // each job by its partition, adding the cpus as we go 137 | for _, n := range nodesData.Nodes { 138 | alloc_cpus := n.AllocCpus 139 | idle_cpus := n.AllocIdleCpus 140 | nodePartitionNames := n.Partitions 141 | for _, partitionName := range nodePartitionNames { 142 | // this needs to exist to handle the test data provided by SLURM 143 | // where the nodes response example data does not correspond to 144 | // the partitions response example data. in real data, the 145 | // partition names should already exist in the map 146 | _, exists := partitions[partitionName] 147 | if !exists { 148 | partitions[partitionName] = NewPartitionsMetrics() 149 | } 150 | 151 | partitions[partitionName].cpus_allocated += float64(alloc_cpus) 152 | partitions[partitionName].cpus_idle += float64(idle_cpus) 153 | } 154 | } 155 | 156 | // derive the other stat 157 | for i, p := range partitions { 158 | partitions[i].cpus_other = p.cpus_total - p.cpus_allocated - p.cpus_idle 159 | } 160 | 161 | // lastly, we need to get a count of pending jobs for the partition 162 | for _, j := range jobsData.Jobs { 163 | // partition name can be comma-separated, so we iterate through it 164 | pnames := strings.Split(j.Partition, ",") 165 | for _, partitionName := range pnames { 166 | // this needs to exist to handle the test data provided by SLURM 167 | // where the nodes response example data does not correspond to 168 | // the partitions response example data. 
in real data, the 169 | // partition names should already exist in the map 170 | _, exists := partitions[partitionName] 171 | if !exists { 172 | partitions[partitionName] = NewPartitionsMetrics() 173 | } 174 | partitions[partitionName].jobs_pending += 1 175 | } 176 | } 177 | 178 | return partitions, nil 179 | } 180 | -------------------------------------------------------------------------------- /testdata/V0041OpenapiPartitionResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "partitions" : [ { 3 | "cluster" : "cluster", 4 | "cpus" : { 5 | "task_binding" : 6, 6 | "total" : 1 7 | }, 8 | "timeouts" : { 9 | "resume" : { 10 | "number" : 9, 11 | "set" : true, 12 | "infinite" : true 13 | }, 14 | "suspend" : { 15 | "number" : 6, 16 | "set" : true, 17 | "infinite" : true 18 | } 19 | }, 20 | "groups" : { 21 | "allowed" : "allowed" 22 | }, 23 | "alternate" : "alternate", 24 | "select_type" : [ "CPU", "CPU" ], 25 | "suspend_time" : { 26 | "number" : 8, 27 | "set" : true, 28 | "infinite" : true 29 | }, 30 | "priority" : { 31 | "tier" : 9, 32 | "job_factor" : 5 33 | }, 34 | "node_sets" : "node_sets", 35 | "maximums" : { 36 | "shares" : 1, 37 | "nodes" : { 38 | "number" : 1, 39 | "set" : true, 40 | "infinite" : true 41 | }, 42 | "over_time_limit" : { 43 | "number" : 1, 44 | "set" : true, 45 | "infinite" : true 46 | }, 47 | "cpus_per_node" : { 48 | "number" : 3, 49 | "set" : true, 50 | "infinite" : true 51 | }, 52 | "cpus_per_socket" : { 53 | "number" : 2, 54 | "set" : true, 55 | "infinite" : true 56 | }, 57 | "partition_memory_per_node" : { 58 | "number" : 1, 59 | "set" : true, 60 | "infinite" : true 61 | }, 62 | "oversubscribe" : { 63 | "jobs" : 6, 64 | "flags" : [ "force", "force" ] 65 | }, 66 | "memory_per_cpu" : 4, 67 | "time" : { 68 | "number" : 7, 69 | "set" : true, 70 | "infinite" : true 71 | }, 72 | "partition_memory_per_cpu" : { 73 | "number" : 7, 74 | "set" : true, 75 | "infinite" : true 76 | } 77 | }, 78 | "nodes" : { 79 | "configured" : "configured", 80 | "total" : 0, 81 | "allowed_allocation" : "allowed_allocation" 82 | }, 83 | "partition" : { 84 | "state" : [ "INACTIVE", "INACTIVE" ] 85 | }, 86 | "qos" : { 87 | "deny" : "deny", 88 | "allowed" : "allowed", 89 | "assigned" : "assigned" 90 | }, 91 | "defaults" : { 92 | "partition_memory_per_node" : { 93 | "number" : 2, 94 | "set" : true, 95 | "infinite" : true 96 | }, 97 | "memory_per_cpu" : 5, 98 | "time" : { 99 | "number" : 7, 100 | "set" : true, 101 | "infinite" : true 102 | }, 103 | "job" : "job", 104 | "partition_memory_per_cpu" : { 105 | "number" : 5, 106 | "set" : true, 107 | "infinite" : true 108 | } 109 | }, 110 | "name" : "name", 111 | "tres" : { 112 | "configured" : "configured", 113 | "billing_weights" : "billing_weights" 114 | }, 115 | "accounts" : { 116 | "deny" : "deny", 117 | "allowed" : "allowed" 118 | }, 119 | "minimums" : { 120 | "nodes" : 4 121 | }, 122 | "grace_time" : 9 123 | }, { 124 | "cluster" : "cluster", 125 | "cpus" : { 126 | "task_binding" : 6, 127 | "total" : 1 128 | }, 129 | "timeouts" : { 130 | "resume" : { 131 | "number" : 9, 132 | "set" : true, 133 | "infinite" : true 134 | }, 135 | "suspend" : { 136 | "number" : 6, 137 | "set" : true, 138 | "infinite" : true 139 | } 140 | }, 141 | "groups" : { 142 | "allowed" : "allowed" 143 | }, 144 | "alternate" : "alternate", 145 | "select_type" : [ "CPU", "CPU" ], 146 | "suspend_time" : { 147 | "number" : 8, 148 | "set" : true, 149 | "infinite" : true 150 | }, 151 | "priority" : { 152 | "tier" : 9, 153 | "job_factor" : 
5 154 | }, 155 | "node_sets" : "node_sets", 156 | "maximums" : { 157 | "shares" : 1, 158 | "nodes" : { 159 | "number" : 1, 160 | "set" : true, 161 | "infinite" : true 162 | }, 163 | "over_time_limit" : { 164 | "number" : 1, 165 | "set" : true, 166 | "infinite" : true 167 | }, 168 | "cpus_per_node" : { 169 | "number" : 3, 170 | "set" : true, 171 | "infinite" : true 172 | }, 173 | "cpus_per_socket" : { 174 | "number" : 2, 175 | "set" : true, 176 | "infinite" : true 177 | }, 178 | "partition_memory_per_node" : { 179 | "number" : 1, 180 | "set" : true, 181 | "infinite" : true 182 | }, 183 | "oversubscribe" : { 184 | "jobs" : 6, 185 | "flags" : [ "force", "force" ] 186 | }, 187 | "memory_per_cpu" : 4, 188 | "time" : { 189 | "number" : 7, 190 | "set" : true, 191 | "infinite" : true 192 | }, 193 | "partition_memory_per_cpu" : { 194 | "number" : 7, 195 | "set" : true, 196 | "infinite" : true 197 | } 198 | }, 199 | "nodes" : { 200 | "configured" : "configured", 201 | "total" : 0, 202 | "allowed_allocation" : "allowed_allocation" 203 | }, 204 | "partition" : { 205 | "state" : [ "INACTIVE", "INACTIVE" ] 206 | }, 207 | "qos" : { 208 | "deny" : "deny", 209 | "allowed" : "allowed", 210 | "assigned" : "assigned" 211 | }, 212 | "defaults" : { 213 | "partition_memory_per_node" : { 214 | "number" : 2, 215 | "set" : true, 216 | "infinite" : true 217 | }, 218 | "memory_per_cpu" : 5, 219 | "time" : { 220 | "number" : 7, 221 | "set" : true, 222 | "infinite" : true 223 | }, 224 | "job" : "job", 225 | "partition_memory_per_cpu" : { 226 | "number" : 5, 227 | "set" : true, 228 | "infinite" : true 229 | } 230 | }, 231 | "name" : "name", 232 | "tres" : { 233 | "configured" : "configured", 234 | "billing_weights" : "billing_weights" 235 | }, 236 | "accounts" : { 237 | "deny" : "deny", 238 | "allowed" : "allowed" 239 | }, 240 | "minimums" : { 241 | "nodes" : 4 242 | }, 243 | "grace_time" : 9 244 | } ], 245 | "meta" : { 246 | "slurm" : { 247 | "cluster" : "cluster", 248 | "release" : "release", 249 | "version" : { 250 | "major" : "major", 251 | "minor" : "minor", 252 | "micro" : "micro" 253 | } 254 | }, 255 | "plugin" : { 256 | "accounting_storage" : "accounting_storage", 257 | "name" : "name", 258 | "type" : "type", 259 | "data_parser" : "data_parser" 260 | }, 261 | "client" : { 262 | "source" : "source", 263 | "user" : "user", 264 | "group" : "group" 265 | }, 266 | "command" : [ "command", "command" ] 267 | }, 268 | "last_update" : { 269 | "number" : 9, 270 | "set" : true, 271 | "infinite" : true 272 | }, 273 | "warnings" : [ { 274 | "description" : "description", 275 | "source" : "source" 276 | }, { 277 | "description" : "description", 278 | "source" : "source" 279 | } ], 280 | "errors" : [ { 281 | "description" : "description", 282 | "source" : "source", 283 | "error" : "error", 284 | "error_number" : 5 285 | }, { 286 | "description" : "description", 287 | "source" : "source", 288 | "error" : "error", 289 | "error_number" : 5 290 | } ] 291 | } 292 | -------------------------------------------------------------------------------- /internal/slurm/scheduler.go: -------------------------------------------------------------------------------- 1 | package slurm 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | 7 | "github.com/akyoto/cache" 8 | "github.com/lcrownover/prometheus-slurm-exporter/internal/api" 9 | "github.com/lcrownover/prometheus-slurm-exporter/internal/types" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | type SchedulerCollector struct { 14 | ctx context.Context 15 | threads 
*prometheus.Desc 16 | queue_size *prometheus.Desc 17 | dbd_queue_size *prometheus.Desc 18 | last_cycle *prometheus.Desc 19 | mean_cycle *prometheus.Desc 20 | cycle_per_minute *prometheus.Desc 21 | backfill_last_cycle *prometheus.Desc 22 | backfill_mean_cycle *prometheus.Desc 23 | backfill_depth_mean *prometheus.Desc 24 | total_backfilled_jobs_since_start *prometheus.Desc 25 | total_backfilled_jobs_since_cycle *prometheus.Desc 26 | total_backfilled_heterogeneous *prometheus.Desc 27 | } 28 | 29 | func NewSchedulerCollector(ctx context.Context) *SchedulerCollector { 30 | return &SchedulerCollector{ 31 | ctx: ctx, 32 | threads: prometheus.NewDesc( 33 | "slurm_scheduler_threads", 34 | "Information provided by the Slurm sdiag command, number of scheduler threads", 35 | nil, 36 | nil), 37 | queue_size: prometheus.NewDesc( 38 | "slurm_scheduler_queue_size", 39 | "Information provided by the Slurm sdiag command, length of the scheduler queue", 40 | nil, 41 | nil), 42 | dbd_queue_size: prometheus.NewDesc( 43 | "slurm_scheduler_dbd_queue_size", 44 | "Information provided by the Slurm sdiag command, length of the DBD agent queue", 45 | nil, 46 | nil), 47 | last_cycle: prometheus.NewDesc( 48 | "slurm_scheduler_last_cycle", 49 | "Information provided by the Slurm sdiag command, scheduler last cycle time (in microseconds)", 50 | nil, 51 | nil), 52 | mean_cycle: prometheus.NewDesc( 53 | "slurm_scheduler_mean_cycle", 54 | "Information provided by the Slurm sdiag command, scheduler mean cycle time (in microseconds)", 55 | nil, 56 | nil), 57 | cycle_per_minute: prometheus.NewDesc( 58 | "slurm_scheduler_cycle_per_minute", 59 | "Information provided by the Slurm sdiag command, number of scheduler cycles per minute", 60 | nil, 61 | nil), 62 | backfill_last_cycle: prometheus.NewDesc( 63 | "slurm_scheduler_backfill_last_cycle", 64 | "Information provided by the Slurm sdiag command, scheduler backfill last cycle time (in microseconds)", 65 | nil, 66 | nil), 67 | backfill_mean_cycle: prometheus.NewDesc( 68 | "slurm_scheduler_backfill_mean_cycle", 69 | "Information provided by the Slurm sdiag command, scheduler backfill mean cycle time (in microseconds)", 70 | nil, 71 | nil), 72 | backfill_depth_mean: prometheus.NewDesc( 73 | "slurm_scheduler_backfill_depth_mean", 74 | "Information provided by the Slurm sdiag command, scheduler backfill mean depth", 75 | nil, 76 | nil), 77 | total_backfilled_jobs_since_start: prometheus.NewDesc( 78 | "slurm_scheduler_backfilled_jobs_since_start_total", 79 | "Information provided by the Slurm sdiag command, number of jobs started thanks to backfilling since last slurm start", 80 | nil, 81 | nil), 82 | total_backfilled_jobs_since_cycle: prometheus.NewDesc( 83 | "slurm_scheduler_backfilled_jobs_since_cycle_total", 84 | "Information provided by the Slurm sdiag command, number of jobs started thanks to backfilling since last time stats were reset", 85 | nil, 86 | nil), 87 | total_backfilled_heterogeneous: prometheus.NewDesc( 88 | "slurm_scheduler_backfilled_heterogeneous_total", 89 | "Information provided by the Slurm sdiag command, number of heterogeneous job components started thanks to backfilling since last Slurm start", 90 | nil, 91 | nil), 92 | } 93 | } 94 | 95 | // Send all metric descriptions 96 | func (c *SchedulerCollector) Describe(ch chan<- *prometheus.Desc) { 97 | ch <- c.threads 98 | ch <- c.queue_size 99 | ch <- c.dbd_queue_size 100 | ch <- c.last_cycle 101 | ch <- c.mean_cycle 102 | ch <- c.cycle_per_minute 103 | ch <- c.backfill_last_cycle 104 | ch <- 
c.backfill_mean_cycle 105 | ch <- c.backfill_depth_mean 106 | ch <- c.total_backfilled_jobs_since_start 107 | ch <- c.total_backfilled_jobs_since_cycle 108 | ch <- c.total_backfilled_heterogeneous 109 | } 110 | 111 | // Send the values of all metrics 112 | func (sc *SchedulerCollector) Collect(ch chan<- prometheus.Metric) { 113 | apiCache := sc.ctx.Value(types.ApiCacheKey).(*cache.Cache) 114 | diagRespBytes, found := apiCache.Get("diag") 115 | if !found { 116 | slog.Error("failed to get diag response for scheduler metrics from cache") 117 | return 118 | } 119 | diagData, err := api.ProcessDiagResponse(diagRespBytes.([]byte)) 120 | if err != nil { 121 | slog.Error("failed to process diag response for scheduler metrics", "error", err) 122 | return 123 | } 124 | sm, err := ParseSchedulerMetrics(diagData) 125 | if err != nil { 126 | slog.Error("failed to collect scheduler metrics", "error", err) 127 | return 128 | } 129 | ch <- prometheus.MustNewConstMetric(sc.threads, prometheus.GaugeValue, sm.threads) 130 | ch <- prometheus.MustNewConstMetric(sc.queue_size, prometheus.GaugeValue, sm.queue_size) 131 | ch <- prometheus.MustNewConstMetric(sc.dbd_queue_size, prometheus.GaugeValue, sm.dbd_queue_size) 132 | ch <- prometheus.MustNewConstMetric(sc.last_cycle, prometheus.GaugeValue, sm.last_cycle) 133 | ch <- prometheus.MustNewConstMetric(sc.mean_cycle, prometheus.GaugeValue, sm.mean_cycle) 134 | ch <- prometheus.MustNewConstMetric(sc.cycle_per_minute, prometheus.GaugeValue, sm.cycle_per_minute) 135 | ch <- prometheus.MustNewConstMetric(sc.backfill_last_cycle, prometheus.GaugeValue, sm.backfill_last_cycle) 136 | ch <- prometheus.MustNewConstMetric(sc.backfill_mean_cycle, prometheus.GaugeValue, sm.backfill_mean_cycle) 137 | ch <- prometheus.MustNewConstMetric(sc.backfill_depth_mean, prometheus.GaugeValue, sm.backfill_depth_mean) 138 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_jobs_since_start, prometheus.GaugeValue, sm.total_backfilled_jobs_since_start) 139 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_jobs_since_cycle, prometheus.GaugeValue, sm.total_backfilled_jobs_since_cycle) 140 | ch <- prometheus.MustNewConstMetric(sc.total_backfilled_heterogeneous, prometheus.GaugeValue, sm.total_backfilled_heterogeneous) 141 | } 142 | 143 | func NewSchedulerMetrics() *schedulerMetrics { 144 | return &schedulerMetrics{} 145 | } 146 | 147 | type schedulerMetrics struct { 148 | threads float64 149 | queue_size float64 150 | dbd_queue_size float64 151 | last_cycle float64 152 | mean_cycle float64 153 | cycle_per_minute float64 154 | backfill_last_cycle float64 155 | backfill_mean_cycle float64 156 | backfill_depth_mean float64 157 | total_backfilled_jobs_since_start float64 158 | total_backfilled_jobs_since_cycle float64 159 | total_backfilled_heterogeneous float64 160 | } 161 | 162 | // Extract the relevant metrics from the sdiag output 163 | func ParseSchedulerMetrics(diagData *api.DiagData) (*schedulerMetrics, error) { 164 | sm := NewSchedulerMetrics() 165 | 166 | sm.threads = float64(diagData.ServerThreadCount) 167 | sm.queue_size = float64(diagData.AgentQueueSize) 168 | sm.dbd_queue_size = float64(diagData.DbdAgentQueueSize) 169 | sm.last_cycle = float64(diagData.ScheduleCycleLast) 170 | sm.mean_cycle = float64(diagData.ScheduleCycleMean) 171 | sm.cycle_per_minute = float64(diagData.ScheduleCyclePerMinute) 172 | sm.backfill_depth_mean = float64(diagData.BfDepthMean) 173 | sm.backfill_last_cycle = float64(diagData.BfCycleLast) 174 | sm.backfill_mean_cycle = 
float64(diagData.BfCycleMean) 175 | sm.total_backfilled_jobs_since_cycle = float64(diagData.BfBackfilledJobs) 176 | sm.total_backfilled_heterogeneous = float64(diagData.BfBackfilledHetJobs) 177 | sm.total_backfilled_jobs_since_start = float64(diagData.BfLastBackfilledJobs) 178 | return sm, nil 179 | } 180 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiSharesResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "shares" : { 3 | "shares" : [ { 4 | "cluster" : "cluster", 5 | "parent" : "parent", 6 | "shares_normalized" : { 7 | "number" : 6.027456183070403, 8 | "set" : true, 9 | "infinite" : true 10 | }, 11 | "usage" : 9, 12 | "fairshare" : { 13 | "level" : 2.027123023002322, 14 | "factor" : 3.616076749251911 15 | }, 16 | "type" : [ "USER", "USER" ], 17 | "effective_usage" : 2.3021358869347655, 18 | "shares" : { 19 | "number" : 1, 20 | "set" : true, 21 | "infinite" : true 22 | }, 23 | "partition" : "partition", 24 | "usage_normalized" : { 25 | "number" : 7.061401241503109, 26 | "set" : true, 27 | "infinite" : true 28 | }, 29 | "name" : "name", 30 | "tres" : { 31 | "run_seconds" : [ { 32 | "name" : "name", 33 | "value" : { 34 | "number" : 5, 35 | "set" : true, 36 | "infinite" : true 37 | } 38 | }, { 39 | "name" : "name", 40 | "value" : { 41 | "number" : 5, 42 | "set" : true, 43 | "infinite" : true 44 | } 45 | } ], 46 | "usage" : [ { 47 | "name" : "name", 48 | "value" : 5.637376656633329 49 | }, { 50 | "name" : "name", 51 | "value" : 5.637376656633329 52 | } ], 53 | "group_minutes" : [ { 54 | "name" : "name", 55 | "value" : { 56 | "number" : 5, 57 | "set" : true, 58 | "infinite" : true 59 | } 60 | }, { 61 | "name" : "name", 62 | "value" : { 63 | "number" : 5, 64 | "set" : true, 65 | "infinite" : true 66 | } 67 | } ] 68 | }, 69 | "id" : 0 70 | }, 71 | { 72 | "id": 104, 73 | "cluster": "cluster1", 74 | "name": "user1", 75 | "parent": "group1", 76 | "partition": "", 77 | "shares_normalized": { 78 | "set": true, 79 | "infinite": false, 80 | "number": 0.333333 81 | }, 82 | "shares": { 83 | "set": true, 84 | "infinite": false, 85 | "number": 1 86 | }, 87 | "tres": { 88 | "run_seconds": [ 89 | { 90 | "name": "cpu", 91 | "value": { 92 | "set": true, 93 | "infinite": false, 94 | "number": 0 95 | } 96 | }, 97 | { 98 | "name": "mem", 99 | "value": { 100 | "set": true, 101 | "infinite": false, 102 | "number": 0 103 | } 104 | }, 105 | { 106 | "name": "energy", 107 | "value": { 108 | "set": true, 109 | "infinite": false, 110 | "number": 0 111 | } 112 | }, 113 | { 114 | "name": "node", 115 | "value": { 116 | "set": true, 117 | "infinite": false, 118 | "number": 0 119 | } 120 | }, 121 | { 122 | "name": "billing", 123 | "value": { 124 | "set": true, 125 | "infinite": false, 126 | "number": 0 127 | } 128 | }, 129 | { 130 | "name": "fs\/disk", 131 | "value": { 132 | "set": true, 133 | "infinite": false, 134 | "number": 0 135 | } 136 | }, 137 | { 138 | "name": "vmem", 139 | "value": { 140 | "set": true, 141 | "infinite": false, 142 | "number": 0 143 | } 144 | }, 145 | { 146 | "name": "pages", 147 | "value": { 148 | "set": true, 149 | "infinite": false, 150 | "number": 0 151 | } 152 | }, 153 | { 154 | "name": "gres\/gpu", 155 | "value": { 156 | "set": true, 157 | "infinite": false, 158 | "number": 0 159 | } 160 | }, 161 | { 162 | "name": "gres\/gpu:a100", 163 | "value": { 164 | "set": true, 165 | "infinite": false, 166 | "number": 0 167 | } 168 | }, 169 | { 170 | "name": "gres\/gpumem", 171 | "value": { 172 | 
"set": true, 173 | "infinite": false, 174 | "number": 0 175 | } 176 | }, 177 | { 178 | "name": "gres\/gpuutil", 179 | "value": { 180 | "set": true, 181 | "infinite": false, 182 | "number": 0 183 | } 184 | } 185 | ], 186 | "group_minutes": [ 187 | { 188 | "name": "cpu", 189 | "value": { 190 | "set": false, 191 | "infinite": true, 192 | "number": 0 193 | } 194 | }, 195 | { 196 | "name": "mem", 197 | "value": { 198 | "set": false, 199 | "infinite": true, 200 | "number": 0 201 | } 202 | }, 203 | { 204 | "name": "energy", 205 | "value": { 206 | "set": false, 207 | "infinite": true, 208 | "number": 0 209 | } 210 | }, 211 | { 212 | "name": "node", 213 | "value": { 214 | "set": false, 215 | "infinite": true, 216 | "number": 0 217 | } 218 | }, 219 | { 220 | "name": "billing", 221 | "value": { 222 | "set": false, 223 | "infinite": true, 224 | "number": 0 225 | } 226 | }, 227 | { 228 | "name": "fs\/disk", 229 | "value": { 230 | "set": false, 231 | "infinite": true, 232 | "number": 0 233 | } 234 | }, 235 | { 236 | "name": "vmem", 237 | "value": { 238 | "set": false, 239 | "infinite": true, 240 | "number": 0 241 | } 242 | }, 243 | { 244 | "name": "pages", 245 | "value": { 246 | "set": false, 247 | "infinite": true, 248 | "number": 0 249 | } 250 | }, 251 | { 252 | "name": "gres\/gpu", 253 | "value": { 254 | "set": false, 255 | "infinite": true, 256 | "number": 0 257 | } 258 | }, 259 | { 260 | "name": "gres\/gpu:a100", 261 | "value": { 262 | "set": false, 263 | "infinite": true, 264 | "number": 0 265 | } 266 | }, 267 | { 268 | "name": "gres\/gpumem", 269 | "value": { 270 | "set": false, 271 | "infinite": true, 272 | "number": 0 273 | } 274 | }, 275 | { 276 | "name": "gres\/gpuutil", 277 | "value": { 278 | "set": false, 279 | "infinite": true, 280 | "number": 0 281 | } 282 | } 283 | ], 284 | "usage": [ 285 | { 286 | "name" : "name", 287 | "value" : 5.637376656633329 288 | }, { 289 | "name" : "name", 290 | "value" : 5.637376656633329 291 | } 292 | ] 293 | 294 | } 295 | }, 296 | { 297 | "cluster" : "cluster", 298 | "parent" : "parent", 299 | "shares_normalized" : { 300 | "number" : 6.027456183070403, 301 | "set" : true, 302 | "infinite" : true 303 | }, 304 | "usage" : 9, 305 | "fairshare" : { 306 | "level" : Infinity, 307 | "factor" : 3.616076749251911 308 | }, 309 | "type" : [ "USER", "USER" ], 310 | "effective_usage" : 2.3021358869347655, 311 | "shares" : { 312 | "number" : 1, 313 | "set" : true, 314 | "infinite" : true 315 | }, 316 | "partition" : "partition", 317 | "usage_normalized" : { 318 | "number" : 7.061401241503109, 319 | "set" : true, 320 | "infinite" : true 321 | }, 322 | "name" : "name", 323 | "tres" : { 324 | "run_seconds" : [ { 325 | "name" : "name", 326 | "value" : { 327 | "number" : 5, 328 | "set" : true, 329 | "infinite" : true 330 | } 331 | }, { 332 | "name" : "name", 333 | "value" : { 334 | "number" : 5, 335 | "set" : true, 336 | "infinite" : true 337 | } 338 | } ], 339 | "usage" : [ 340 | { 341 | "name" : "name", 342 | "value" : 5.637376656633329 343 | }, { 344 | "name" : "name", 345 | "value" : 5.637376656633329 346 | } 347 | ], 348 | "group_minutes" : [ { 349 | "name" : "name", 350 | "value" : { 351 | "number" : 5, 352 | "set" : true, 353 | "infinite" : true 354 | } 355 | }, { 356 | "name" : "name", 357 | "value" : { 358 | "number" : 5, 359 | "set" : true, 360 | "infinite" : true 361 | } 362 | } ] 363 | }, 364 | "id" : 0 365 | } ], 366 | "total_shares" : 4 367 | }, 368 | "meta" : { 369 | "slurm" : { 370 | "cluster" : "cluster", 371 | "release" : "release", 372 | "version" : { 373 | 
"major" : "major", 374 | "minor" : "minor", 375 | "micro" : "micro" 376 | } 377 | }, 378 | "plugin" : { 379 | "accounting_storage" : "accounting_storage", 380 | "name" : "name", 381 | "type" : "type", 382 | "data_parser" : "data_parser" 383 | }, 384 | "client" : { 385 | "source" : "source", 386 | "user" : "user", 387 | "group" : "group" 388 | }, 389 | "command" : [ "command", "command" ] 390 | }, 391 | "warnings" : [ { 392 | "description" : "description", 393 | "source" : "source" 394 | }, { 395 | "description" : "description", 396 | "source" : "source" 397 | } ], 398 | "errors" : [ { 399 | "description" : "description", 400 | "source" : "source", 401 | "error" : "error", 402 | "error_number" : 5 403 | }, { 404 | "description" : "description", 405 | "source" : "source", 406 | "error" : "error", 407 | "error_number" : 5 408 | } ] 409 | } 410 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiDiagResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "statistics": { 3 | "parts_packed": 1, 4 | "req_time": { 5 | "set": true, 6 | "infinite": false, 7 | "number": 1726764981 8 | }, 9 | "req_time_start": { 10 | "set": true, 11 | "infinite": false, 12 | "number": 1726704000 13 | }, 14 | "server_thread_count": 2, 15 | "agent_queue_size": 0, 16 | "agent_count": 0, 17 | "agent_thread_count": 0, 18 | "dbd_agent_queue_size": 0, 19 | "gettimeofday_latency": 17, 20 | "schedule_cycle_max": 14942, 21 | "schedule_cycle_last": 22, 22 | "schedule_cycle_total": 1065, 23 | "schedule_cycle_mean": 49, 24 | "schedule_cycle_mean_depth": 0, 25 | "schedule_cycle_per_minute": 1, 26 | "schedule_queue_length": 0, 27 | "schedule_exit": { 28 | "end_job_queue": 1065, 29 | "default_queue_depth": 0, 30 | "max_job_start": 0, 31 | "max_rpc_cnt": 0, 32 | "max_sched_time": 0, 33 | "licenses": 0 34 | }, 35 | "jobs_submitted": 2, 36 | "jobs_started": 0, 37 | "jobs_completed": 2, 38 | "jobs_canceled": 0, 39 | "jobs_failed": 0, 40 | "jobs_pending": 25, 41 | "jobs_running": 1, 42 | "job_states_ts": { 43 | "set": true, 44 | "infinite": false, 45 | "number": 1726764972 46 | }, 47 | "bf_backfilled_jobs": 13, 48 | "bf_last_backfilled_jobs": 0, 49 | "bf_backfilled_het_jobs": 0, 50 | "bf_cycle_counter": 0, 51 | "bf_cycle_mean": 0, 52 | "bf_depth_mean": 0, 53 | "bf_depth_mean_try": 0, 54 | "bf_cycle_sum": 0, 55 | "bf_cycle_last": 0, 56 | "bf_last_depth": 0, 57 | "bf_last_depth_try": 0, 58 | "bf_depth_sum": 0, 59 | "bf_depth_try_sum": 0, 60 | "bf_queue_len": 0, 61 | "bf_queue_len_mean": 0, 62 | "bf_queue_len_sum": 0, 63 | "bf_table_size": 1, 64 | "bf_table_size_mean": 0, 65 | "bf_when_last_cycle": { 66 | "set": true, 67 | "infinite": false, 68 | "number": 1726695861 69 | }, 70 | "bf_active": false, 71 | "bf_exit": { 72 | "end_job_queue": 0, 73 | "bf_max_job_start": 0, 74 | "bf_max_job_test": 0, 75 | "bf_max_time": 0, 76 | "bf_node_space_size": 0, 77 | "state_changed": 0 78 | }, 79 | "rpcs_by_message_type": [ 80 | { 81 | "message_type": "REQUEST_TRIGGER_PULL", 82 | "type_id": 2030, 83 | "count": 1, 84 | "average_time": 104, 85 | "total_time": 104 86 | }, 87 | { 88 | "message_type": "REQUEST_CONTROL_STATUS", 89 | "type_id": 2053, 90 | "count": 3578, 91 | "average_time": 21, 92 | "total_time": 76126 93 | }, 94 | { 95 | "message_type": "REQUEST_FED_INFO", 96 | "type_id": 2049, 97 | "count": 59, 98 | "average_time": 21, 99 | "total_time": 1287 100 | }, 101 | { 102 | "message_type": "REQUEST_JOB_USER_INFO", 103 | "type_id": 2039, 104 | "count": 14, 
105 | "average_time": 393, 106 | "total_time": 5513 107 | }, 108 | { 109 | "message_type": "REQUEST_PARTITION_INFO", 110 | "type_id": 2009, 111 | "count": 402960, 112 | "average_time": 42, 113 | "total_time": 17203621 114 | }, 115 | { 116 | "message_type": "REQUEST_SUBMIT_BATCH_JOB", 117 | "type_id": 4003, 118 | "count": 6, 119 | "average_time": 2101, 120 | "total_time": 12608 121 | }, 122 | { 123 | "message_type": "REQUEST_NODE_INFO", 124 | "type_id": 2007, 125 | "count": 404869, 126 | "average_time": 370518, 127 | "total_time": 150011441628 128 | }, 129 | { 130 | "message_type": "REQUEST_CONFIG", 131 | "type_id": 2015, 132 | "count": 661, 133 | "average_time": 73, 134 | "total_time": 48817 135 | }, 136 | { 137 | "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", 138 | "type_id": 1002, 139 | "count": 999, 140 | "average_time": 503, 141 | "total_time": 503284 142 | }, 143 | { 144 | "message_type": "REQUEST_COMPLETE_PROLOG", 145 | "type_id": 6018, 146 | "count": 45, 147 | "average_time": 16740, 148 | "total_time": 753313 149 | }, 150 | { 151 | "message_type": "REQUEST_COMPLETE_BATCH_SCRIPT", 152 | "type_id": 5018, 153 | "count": 36, 154 | "average_time": 196, 155 | "total_time": 7069 156 | }, 157 | { 158 | "message_type": "REQUEST_STEP_COMPLETE", 159 | "type_id": 5016, 160 | "count": 44, 161 | "average_time": 168, 162 | "total_time": 7411 163 | }, 164 | { 165 | "message_type": "REQUEST_JOB_INFO_SINGLE", 166 | "type_id": 2021, 167 | "count": 45, 168 | "average_time": 2438, 169 | "total_time": 109735 170 | }, 171 | { 172 | "message_type": "MESSAGE_EPILOG_COMPLETE", 173 | "type_id": 6012, 174 | "count": 39, 175 | "average_time": 30745088, 176 | "total_time": 1199058444 177 | }, 178 | { 179 | "message_type": "REQUEST_HET_JOB_ALLOC_INFO", 180 | "type_id": 4027, 181 | "count": 2, 182 | "average_time": 205, 183 | "total_time": 410 184 | }, 185 | { 186 | "message_type": "REQUEST_JOB_STEP_CREATE", 187 | "type_id": 5001, 188 | "count": 3, 189 | "average_time": 283, 190 | "total_time": 850 191 | }, 192 | { 193 | "message_type": "REQUEST_RESOURCE_ALLOCATION", 194 | "type_id": 4001, 195 | "count": 18, 196 | "average_time": 274360, 197 | "total_time": 4938484 198 | }, 199 | { 200 | "message_type": "REQUEST_JOB_READY", 201 | "type_id": 4019, 202 | "count": 2, 203 | "average_time": 22, 204 | "total_time": 44 205 | }, 206 | { 207 | "message_type": "REQUEST_UPDATE_PARTITION", 208 | "type_id": 3005, 209 | "count": 34, 210 | "average_time": 201, 211 | "total_time": 6843 212 | }, 213 | { 214 | "message_type": "ACCOUNTING_REGISTER_CTLD", 215 | "type_id": 10003, 216 | "count": 1, 217 | "average_time": 86444, 218 | "total_time": 86444 219 | }, 220 | { 221 | "message_type": "REQUEST_PERSIST_INIT", 222 | "type_id": 6500, 223 | "count": 1, 224 | "average_time": 57, 225 | "total_time": 57 226 | }, 227 | { 228 | "message_type": "ACCOUNTING_UPDATE_MSG", 229 | "type_id": 10001, 230 | "count": 1, 231 | "average_time": 22, 232 | "total_time": 22 233 | }, 234 | { 235 | "message_type": "REQUEST_AUTH_TOKEN", 236 | "type_id": 5039, 237 | "count": 1, 238 | "average_time": 262, 239 | "total_time": 262 240 | }, 241 | { 242 | "message_type": "REQUEST_JOB_INFO", 243 | "type_id": 2003, 244 | "count": 15, 245 | "average_time": 597, 246 | "total_time": 8969 247 | }, 248 | { 249 | "message_type": "REQUEST_STATS_INFO", 250 | "type_id": 2035, 251 | "count": 9, 252 | "average_time": 31, 253 | "total_time": 281 254 | }, 255 | { 256 | "message_type": "REQUEST_SHARE_INFO", 257 | "type_id": 2022, 258 | "count": 8, 259 | "average_time": 3486, 
260 | "total_time": 27888 261 | }, 262 | { 263 | "message_type": "REQUEST_CANCEL_JOB_STEP", 264 | "type_id": 5005, 265 | "count": 1, 266 | "average_time": 218, 267 | "total_time": 218 268 | }, 269 | { 270 | "message_type": "REQUEST_COMPLETE_JOB_ALLOCATION", 271 | "type_id": 5017, 272 | "count": 24, 273 | "average_time": 298, 274 | "total_time": 7167 275 | }, 276 | { 277 | "message_type": "REQUEST_JOB_ALLOCATION_INFO", 278 | "type_id": 4014, 279 | "count": 11, 280 | "average_time": 21, 281 | "total_time": 235 282 | }, 283 | { 284 | "message_type": "REQUEST_KILL_JOB", 285 | "type_id": 5032, 286 | "count": 2, 287 | "average_time": 177, 288 | "total_time": 354 289 | } 290 | ], 291 | "rpcs_by_user": [ 292 | { 293 | "user": "root", 294 | "user_id": 0, 295 | "count": 809723, 296 | "average_time": 186766, 297 | "total_time": 151229024182 298 | }, 299 | { 300 | "user": "slurm", 301 | "user_id": 58, 302 | "count": 3582, 303 | "average_time": 45, 304 | "total_time": 162753 305 | }, 306 | { 307 | "user": "vspauldi", 308 | "user_id": 239489, 309 | "count": 7, 310 | "average_time": 512, 311 | "total_time": 3590 312 | } 313 | ] 314 | }, 315 | "meta": { 316 | "plugin": { 317 | "type": "openapi/slurmctld", 318 | "name": "Slurm OpenAPI slurmctld", 319 | "data_parser": "data_parser/v0.0.40", 320 | "accounting_storage": "accounting_storage/slurmdbd" 321 | }, 322 | "client": { 323 | "source": "[10.174.138.225]:59585", 324 | "user": "root", 325 | "group": "root" 326 | }, 327 | "command": [], 328 | "slurm": { 329 | "version": { 330 | "major": "23", 331 | "micro": "1", 332 | "minor": "11" 333 | }, 334 | "release": "23.11.1", 335 | "cluster": "mycluster" 336 | } 337 | }, 338 | "errors": [], 339 | "warnings": [] 340 | } 341 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiPartitionResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "partitions": [ 3 | { 4 | "nodes": { 5 | "allowed_allocation": "", 6 | "configured": "n[0111-0135,0180-0196]", 7 | "total": 42 8 | }, 9 | "accounts": { 10 | "allowed": "", 11 | "deny": "" 12 | }, 13 | "groups": { 14 | "allowed": "" 15 | }, 16 | "qos": { 17 | "allowed": "", 18 | "deny": "", 19 | "assigned": "compute" 20 | }, 21 | "alternate": "", 22 | "tres": { 23 | "billing_weights": "", 24 | "configured": "cpu=5376,mem=21186186M,node=42,billing=5376" 25 | }, 26 | "cluster": "", 27 | "cpus": { 28 | "task_binding": 0, 29 | "total": 5376 30 | }, 31 | "defaults": { 32 | "memory_per_cpu": -9223372036854771712, 33 | "partition_memory_per_cpu": { 34 | "set": true, 35 | "infinite": false, 36 | "number": 4096 37 | }, 38 | "partition_memory_per_node": { 39 | "set": false, 40 | "infinite": false, 41 | "number": 0 42 | }, 43 | "time": { 44 | "set": true, 45 | "infinite": false, 46 | "number": 1440 47 | }, 48 | "job": "" 49 | }, 50 | "grace_time": 0, 51 | "maximums": { 52 | "cpus_per_node": { 53 | "set": false, 54 | "infinite": true, 55 | "number": 0 56 | }, 57 | "cpus_per_socket": { 58 | "set": false, 59 | "infinite": true, 60 | "number": 0 61 | }, 62 | "memory_per_cpu": 0, 63 | "partition_memory_per_cpu": { 64 | "set": false, 65 | "infinite": false, 66 | "number": 0 67 | }, 68 | "partition_memory_per_node": { 69 | "set": true, 70 | "infinite": false, 71 | "number": 0 72 | }, 73 | "nodes": { 74 | "set": false, 75 | "infinite": true, 76 | "number": 0 77 | }, 78 | "shares": 1, 79 | "oversubscribe": { 80 | "jobs": 1, 81 | "flags": [] 82 | }, 83 | "time": { 84 | "set": true, 85 | "infinite": false, 
86 | "number": 1440 87 | }, 88 | "over_time_limit": { 89 | "set": false, 90 | "infinite": false, 91 | "number": 0 92 | } 93 | }, 94 | "minimums": { 95 | "nodes": 1 96 | }, 97 | "name": "compute", 98 | "node_sets": "", 99 | "priority": { 100 | "job_factor": 200, 101 | "tier": 200 102 | }, 103 | "timeouts": { 104 | "resume": { 105 | "set": false, 106 | "infinite": false, 107 | "number": 0 108 | }, 109 | "suspend": { 110 | "set": false, 111 | "infinite": false, 112 | "number": 0 113 | } 114 | }, 115 | "partition": { 116 | "state": [ 117 | "UP" 118 | ] 119 | }, 120 | "suspend_time": { 121 | "set": false, 122 | "infinite": false, 123 | "number": 0 124 | } 125 | }, 126 | { 127 | "nodes": { 128 | "allowed_allocation": "", 129 | "configured": "n[0149-0160,0162-0172]", 130 | "total": 23 131 | }, 132 | "accounts": { 133 | "allowed": "", 134 | "deny": "" 135 | }, 136 | "groups": { 137 | "allowed": "" 138 | }, 139 | "qos": { 140 | "allowed": "", 141 | "deny": "", 142 | "assigned": "gpu" 143 | }, 144 | "alternate": "", 145 | "tres": { 146 | "billing_weights": "", 147 | "configured": "cpu=1104,mem=8505383M,node=23,billing=1104,gres/gpu=155" 148 | }, 149 | "cluster": "", 150 | "cpus": { 151 | "task_binding": 0, 152 | "total": 1104 153 | }, 154 | "defaults": { 155 | "memory_per_cpu": -9223372036854771712, 156 | "partition_memory_per_cpu": { 157 | "set": true, 158 | "infinite": false, 159 | "number": 4096 160 | }, 161 | "partition_memory_per_node": { 162 | "set": false, 163 | "infinite": false, 164 | "number": 0 165 | }, 166 | "time": { 167 | "set": true, 168 | "infinite": false, 169 | "number": 1440 170 | }, 171 | "job": "" 172 | }, 173 | "grace_time": 0, 174 | "maximums": { 175 | "cpus_per_node": { 176 | "set": false, 177 | "infinite": true, 178 | "number": 0 179 | }, 180 | "cpus_per_socket": { 181 | "set": false, 182 | "infinite": true, 183 | "number": 0 184 | }, 185 | "memory_per_cpu": 0, 186 | "partition_memory_per_cpu": { 187 | "set": false, 188 | "infinite": false, 189 | "number": 0 190 | }, 191 | "partition_memory_per_node": { 192 | "set": true, 193 | "infinite": false, 194 | "number": 0 195 | }, 196 | "nodes": { 197 | "set": false, 198 | "infinite": true, 199 | "number": 0 200 | }, 201 | "shares": 1, 202 | "oversubscribe": { 203 | "jobs": 1, 204 | "flags": [] 205 | }, 206 | "time": { 207 | "set": true, 208 | "infinite": false, 209 | "number": 1440 210 | }, 211 | "over_time_limit": { 212 | "set": false, 213 | "infinite": false, 214 | "number": 0 215 | } 216 | }, 217 | "minimums": { 218 | "nodes": 1 219 | }, 220 | "name": "gpu", 221 | "node_sets": "", 222 | "priority": { 223 | "job_factor": 200, 224 | "tier": 200 225 | }, 226 | "timeouts": { 227 | "resume": { 228 | "set": false, 229 | "infinite": false, 230 | "number": 0 231 | }, 232 | "suspend": { 233 | "set": false, 234 | "infinite": false, 235 | "number": 0 236 | } 237 | }, 238 | "partition": { 239 | "state": [ 240 | "UP" 241 | ] 242 | }, 243 | "suspend_time": { 244 | "set": false, 245 | "infinite": false, 246 | "number": 0 247 | } 248 | }, 249 | { 250 | "nodes": { 251 | "allowed_allocation": "", 252 | "configured": "n[0141-0148,0372-0379]", 253 | "total": 16 254 | }, 255 | "accounts": { 256 | "allowed": "", 257 | "deny": "" 258 | }, 259 | "groups": { 260 | "allowed": "" 261 | }, 262 | "qos": { 263 | "allowed": "", 264 | "deny": "", 265 | "assigned": "memory" 266 | }, 267 | "alternate": "", 268 | "tres": { 269 | "billing_weights": "", 270 | "configured": "cpu=896,mem=36741334M,node=16,billing=896" 271 | }, 272 | "cluster": "", 273 | "cpus": { 274 
| "task_binding": 0, 275 | "total": 896 276 | }, 277 | "defaults": { 278 | "memory_per_cpu": -9223372036854771712, 279 | "partition_memory_per_cpu": { 280 | "set": true, 281 | "infinite": false, 282 | "number": 4096 283 | }, 284 | "partition_memory_per_node": { 285 | "set": false, 286 | "infinite": false, 287 | "number": 0 288 | }, 289 | "time": { 290 | "set": true, 291 | "infinite": false, 292 | "number": 1440 293 | }, 294 | "job": "" 295 | }, 296 | "grace_time": 0, 297 | "maximums": { 298 | "cpus_per_node": { 299 | "set": false, 300 | "infinite": true, 301 | "number": 0 302 | }, 303 | "cpus_per_socket": { 304 | "set": false, 305 | "infinite": true, 306 | "number": 0 307 | }, 308 | "memory_per_cpu": 0, 309 | "partition_memory_per_cpu": { 310 | "set": false, 311 | "infinite": false, 312 | "number": 0 313 | }, 314 | "partition_memory_per_node": { 315 | "set": true, 316 | "infinite": false, 317 | "number": 0 318 | }, 319 | "nodes": { 320 | "set": false, 321 | "infinite": true, 322 | "number": 0 323 | }, 324 | "shares": 1, 325 | "oversubscribe": { 326 | "jobs": 1, 327 | "flags": [] 328 | }, 329 | "time": { 330 | "set": true, 331 | "infinite": false, 332 | "number": 1440 333 | }, 334 | "over_time_limit": { 335 | "set": false, 336 | "infinite": false, 337 | "number": 0 338 | } 339 | }, 340 | "minimums": { 341 | "nodes": 1 342 | }, 343 | "name": "memory", 344 | "node_sets": "", 345 | "priority": { 346 | "job_factor": 200, 347 | "tier": 200 348 | }, 349 | "timeouts": { 350 | "resume": { 351 | "set": false, 352 | "infinite": false, 353 | "number": 0 354 | }, 355 | "suspend": { 356 | "set": false, 357 | "infinite": false, 358 | "number": 0 359 | } 360 | }, 361 | "partition": { 362 | "state": [ 363 | "UP" 364 | ] 365 | }, 366 | "suspend_time": { 367 | "set": false, 368 | "infinite": false, 369 | "number": 0 370 | } 371 | }, 372 | { 373 | "nodes": { 374 | "allowed_allocation": "", 375 | "configured": "n[0142,0144,0146,0148,0372,0374,0376,0378]", 376 | "total": 8 377 | }, 378 | "accounts": { 379 | "allowed": "", 380 | "deny": "" 381 | }, 382 | "groups": { 383 | "allowed": "" 384 | }, 385 | "qos": { 386 | "allowed": "", 387 | "deny": "", 388 | "assigned": "memory" 389 | }, 390 | "alternate": "", 391 | "tres": { 392 | "billing_weights": "", 393 | "configured": "cpu=448,mem=18370667M,node=8,billing=448" 394 | }, 395 | "cluster": "", 396 | "cpus": { 397 | "task_binding": 0, 398 | "total": 448 399 | }, 400 | "defaults": { 401 | "memory_per_cpu": -9223372036854771712, 402 | "partition_memory_per_cpu": { 403 | "set": true, 404 | "infinite": false, 405 | "number": 4096 406 | }, 407 | "partition_memory_per_node": { 408 | "set": false, 409 | "infinite": false, 410 | "number": 0 411 | }, 412 | "time": { 413 | "set": true, 414 | "infinite": false, 415 | "number": 20160 416 | }, 417 | "job": "" 418 | }, 419 | "grace_time": 0, 420 | "maximums": { 421 | "cpus_per_node": { 422 | "set": false, 423 | "infinite": true, 424 | "number": 0 425 | }, 426 | "cpus_per_socket": { 427 | "set": false, 428 | "infinite": true, 429 | "number": 0 430 | }, 431 | "memory_per_cpu": 0, 432 | "partition_memory_per_cpu": { 433 | "set": false, 434 | "infinite": false, 435 | "number": 0 436 | }, 437 | "partition_memory_per_node": { 438 | "set": true, 439 | "infinite": false, 440 | "number": 0 441 | }, 442 | "nodes": { 443 | "set": false, 444 | "infinite": true, 445 | "number": 0 446 | }, 447 | "shares": 1, 448 | "oversubscribe": { 449 | "jobs": 1, 450 | "flags": [] 451 | }, 452 | "time": { 453 | "set": true, 454 | "infinite": false, 455 | 
"number": 20160 456 | }, 457 | "over_time_limit": { 458 | "set": false, 459 | "infinite": false, 460 | "number": 0 461 | } 462 | }, 463 | "minimums": { 464 | "nodes": 1 465 | }, 466 | "name": "memorylong", 467 | "node_sets": "", 468 | "priority": { 469 | "job_factor": 200, 470 | "tier": 200 471 | }, 472 | "timeouts": { 473 | "resume": { 474 | "set": false, 475 | "infinite": false, 476 | "number": 0 477 | }, 478 | "suspend": { 479 | "set": false, 480 | "infinite": false, 481 | "number": 0 482 | } 483 | }, 484 | "partition": { 485 | "state": [ 486 | "UP" 487 | ] 488 | }, 489 | "suspend_time": { 490 | "set": false, 491 | "infinite": false, 492 | "number": 0 493 | } 494 | }, 495 | { 496 | "nodes": { 497 | "allowed_allocation": "", 498 | "configured": "n[0013-0044,0049-0136,0141-0189,0191-0196,0199,0201-0242,0244-0269,0301-0308,0310-0399,0998-1000]", 499 | "total": 345 500 | }, 501 | "accounts": { 502 | "allowed": "", 503 | "deny": "" 504 | }, 505 | "groups": { 506 | "allowed": "" 507 | }, 508 | "qos": { 509 | "allowed": "", 510 | "deny": "", 511 | "assigned": "preempt" 512 | }, 513 | "alternate": "", 514 | "tres": { 515 | "billing_weights": "", 516 | "configured": "cpu=17772,mem=145334877M,node=345,billing=17772,gres/gpu=230" 517 | }, 518 | "cluster": "", 519 | "cpus": { 520 | "task_binding": 0, 521 | "total": 17772 522 | }, 523 | "defaults": { 524 | "memory_per_cpu": -9223372036854771712, 525 | "partition_memory_per_cpu": { 526 | "set": true, 527 | "infinite": false, 528 | "number": 4096 529 | }, 530 | "partition_memory_per_node": { 531 | "set": false, 532 | "infinite": false, 533 | "number": 0 534 | }, 535 | "time": { 536 | "set": true, 537 | "infinite": false, 538 | "number": 10080 539 | }, 540 | "job": "" 541 | }, 542 | "grace_time": 0, 543 | "maximums": { 544 | "cpus_per_node": { 545 | "set": false, 546 | "infinite": true, 547 | "number": 0 548 | }, 549 | "cpus_per_socket": { 550 | "set": false, 551 | "infinite": true, 552 | "number": 0 553 | }, 554 | "memory_per_cpu": 0, 555 | "partition_memory_per_cpu": { 556 | "set": false, 557 | "infinite": false, 558 | "number": 0 559 | }, 560 | "partition_memory_per_node": { 561 | "set": true, 562 | "infinite": false, 563 | "number": 0 564 | }, 565 | "nodes": { 566 | "set": true, 567 | "infinite": false, 568 | "number": 48 569 | }, 570 | "shares": 1, 571 | "oversubscribe": { 572 | "jobs": 1, 573 | "flags": [] 574 | }, 575 | "time": { 576 | "set": true, 577 | "infinite": false, 578 | "number": 10080 579 | }, 580 | "over_time_limit": { 581 | "set": false, 582 | "infinite": false, 583 | "number": 0 584 | } 585 | }, 586 | "minimums": { 587 | "nodes": 1 588 | }, 589 | "name": "preempt", 590 | "node_sets": "", 591 | "priority": { 592 | "job_factor": 1, 593 | "tier": 1 594 | }, 595 | "timeouts": { 596 | "resume": { 597 | "set": false, 598 | "infinite": false, 599 | "number": 0 600 | }, 601 | "suspend": { 602 | "set": false, 603 | "infinite": false, 604 | "number": 0 605 | } 606 | }, 607 | "partition": { 608 | "state": [ 609 | "UP" 610 | ] 611 | }, 612 | "suspend_time": { 613 | "set": false, 614 | "infinite": false, 615 | "number": 0 616 | } 617 | } 618 | ], 619 | "last_update": { 620 | "set": true, 621 | "infinite": false, 622 | "number": 1727286013 623 | }, 624 | "meta": { 625 | "plugin": { 626 | "type": "openapi/slurmctld", 627 | "name": "Slurm OpenAPI slurmctld", 628 | "data_parser": "data_parser/v0.0.40", 629 | "accounting_storage": "accounting_storage/slurmdbd" 630 | }, 631 | "client": { 632 | "source": "[10.174.139.128]:55418", 633 | "user": "root", 
634 | "group": "root" 635 | }, 636 | "command": [], 637 | "slurm": { 638 | "version": { 639 | "major": "23", 640 | "micro": "1", 641 | "minor": "11" 642 | }, 643 | "release": "23.11.1", 644 | "cluster": "cluster" 645 | } 646 | }, 647 | "errors": [], 648 | "warnings": [] 649 | } 650 | -------------------------------------------------------------------------------- /testdata/V0040OpenapiJobInfoResp.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "account": "jamming", 5 | "accrue_time": { 6 | "set": true, 7 | "infinite": false, 8 | "number": 1722268326 9 | }, 10 | "admin_comment": "", 11 | "allocating_node": "login1", 12 | "array_job_id": { 13 | "set": true, 14 | "infinite": false, 15 | "number": 7725337 16 | }, 17 | "array_task_id": { 18 | "set": true, 19 | "infinite": false, 20 | "number": 411 21 | }, 22 | "array_max_tasks": { 23 | "set": true, 24 | "infinite": false, 25 | "number": 0 26 | }, 27 | "array_task_string": "", 28 | "association_id": 70, 29 | "batch_features": "", 30 | "batch_flag": true, 31 | "batch_host": "n0180", 32 | "flags": [ 33 | "ACCRUE_COUNT_CLEARED", 34 | "JOB_WAS_RUNNING", 35 | "USING_DEFAULT_QOS", 36 | "USING_DEFAULT_WCKEY" 37 | ], 38 | "burst_buffer": "", 39 | "burst_buffer_state": "", 40 | "cluster": "talapas", 41 | "cluster_features": "", 42 | "command": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/relaxAndShearScheme.srun", 43 | "comment": "", 44 | "container": "", 45 | "container_id": "", 46 | "contiguous": false, 47 | "core_spec": 0, 48 | "thread_spec": 32766, 49 | "cores_per_socket": { 50 | "set": false, 51 | "infinite": false, 52 | "number": 0 53 | }, 54 | "billable_tres": { 55 | "set": true, 56 | "infinite": false, 57 | "number": 1.0 58 | }, 59 | "cpus_per_task": { 60 | "set": true, 61 | "infinite": false, 62 | "number": 1 63 | }, 64 | "cpu_frequency_minimum": { 65 | "set": false, 66 | "infinite": false, 67 | "number": 0 68 | }, 69 | "cpu_frequency_maximum": { 70 | "set": false, 71 | "infinite": false, 72 | "number": 0 73 | }, 74 | "cpu_frequency_governor": { 75 | "set": false, 76 | "infinite": false, 77 | "number": 0 78 | }, 79 | "cpus_per_tres": "", 80 | "cron": "", 81 | "deadline": { 82 | "set": true, 83 | "infinite": false, 84 | "number": 0 85 | }, 86 | "delay_boot": { 87 | "set": true, 88 | "infinite": false, 89 | "number": 0 90 | }, 91 | "dependency": "", 92 | "derived_exit_code": { 93 | "status": [ 94 | "SUCCESS" 95 | ], 96 | "return_code": { 97 | "set": true, 98 | "infinite": false, 99 | "number": 0 100 | }, 101 | "signal": { 102 | "id": { 103 | "set": false, 104 | "infinite": false, 105 | "number": 0 106 | }, 107 | "name": "" 108 | } 109 | }, 110 | "eligible_time": { 111 | "set": true, 112 | "infinite": false, 113 | "number": 1722268326 114 | }, 115 | "end_time": { 116 | "set": true, 117 | "infinite": false, 118 | "number": 1722360761 119 | }, 120 | "excluded_nodes": "", 121 | "exit_code": { 122 | "status": [ 123 | "SUCCESS" 124 | ], 125 | "return_code": { 126 | "set": true, 127 | "infinite": false, 128 | "number": 0 129 | }, 130 | "signal": { 131 | "id": { 132 | "set": false, 133 | "infinite": false, 134 | "number": 0 135 | }, 136 | "name": "" 137 | } 138 | }, 139 | "extra": "", 140 | "failed_node": "", 141 | "features": "", 142 | "federation_origin": "", 143 | "federation_siblings_active": "", 144 | "federation_siblings_viable": "", 145 | "gres_detail": [], 146 | "group_id": 131, 147 | "group_name": "uoregon", 148 | "het_job_id": { 149 | "set": true, 150 | "infinite": 
false, 151 | "number": 0 152 | }, 153 | "het_job_id_set": "", 154 | "het_job_offset": { 155 | "set": true, 156 | "infinite": false, 157 | "number": 0 158 | }, 159 | "job_id": 7745162, 160 | "job_resources": { 161 | "nodes": "n0180", 162 | "allocated_cores": 1, 163 | "allocated_cpus": 0, 164 | "allocated_hosts": 1, 165 | "allocated_nodes": [ 166 | { 167 | "sockets": { 168 | "1": { 169 | "cores": { 170 | "51": "allocated" 171 | } 172 | } 173 | }, 174 | "nodename": "n0180", 175 | "cpus_used": 0, 176 | "memory_used": 0, 177 | "memory_allocated": 4096 178 | } 179 | ] 180 | }, 181 | "job_size_str": [], 182 | "job_state": [ 183 | "RUNNING" 184 | ], 185 | "last_sched_evaluation": { 186 | "set": true, 187 | "infinite": false, 188 | "number": 1722274361 189 | }, 190 | "licenses": "", 191 | "mail_type": [], 192 | "mail_user": "rdennis", 193 | "max_cpus": { 194 | "set": true, 195 | "infinite": false, 196 | "number": 0 197 | }, 198 | "max_nodes": { 199 | "set": true, 200 | "infinite": false, 201 | "number": 0 202 | }, 203 | "mcs_label": "", 204 | "memory_per_tres": "", 205 | "name": "rands", 206 | "network": "", 207 | "nodes": "n0180", 208 | "nice": 0, 209 | "tasks_per_core": { 210 | "set": false, 211 | "infinite": true, 212 | "number": 0 213 | }, 214 | "tasks_per_tres": { 215 | "set": true, 216 | "infinite": false, 217 | "number": 0 218 | }, 219 | "tasks_per_node": { 220 | "set": true, 221 | "infinite": false, 222 | "number": 1 223 | }, 224 | "tasks_per_socket": { 225 | "set": false, 226 | "infinite": true, 227 | "number": 0 228 | }, 229 | "tasks_per_board": { 230 | "set": true, 231 | "infinite": false, 232 | "number": 0 233 | }, 234 | "cpus": { 235 | "set": true, 236 | "infinite": false, 237 | "number": 1 238 | }, 239 | "node_count": { 240 | "set": true, 241 | "infinite": false, 242 | "number": 1 243 | }, 244 | "tasks": { 245 | "set": true, 246 | "infinite": false, 247 | "number": 1 248 | }, 249 | "partition": "preempt", 250 | "prefer": "", 251 | "memory_per_cpu": { 252 | "set": true, 253 | "infinite": false, 254 | "number": 4096 255 | }, 256 | "memory_per_node": { 257 | "set": false, 258 | "infinite": false, 259 | "number": 0 260 | }, 261 | "minimum_cpus_per_node": { 262 | "set": true, 263 | "infinite": false, 264 | "number": 1 265 | }, 266 | "minimum_tmp_disk_per_node": { 267 | "set": true, 268 | "infinite": false, 269 | "number": 0 270 | }, 271 | "power": { 272 | "flags": [] 273 | }, 274 | "preempt_time": { 275 | "set": true, 276 | "infinite": false, 277 | "number": 0 278 | }, 279 | "preemptable_time": { 280 | "set": true, 281 | "infinite": false, 282 | "number": 1722274361 283 | }, 284 | "pre_sus_time": { 285 | "set": true, 286 | "infinite": false, 287 | "number": 0 288 | }, 289 | "priority": { 290 | "set": true, 291 | "infinite": false, 292 | "number": 169465 293 | }, 294 | "profile": [ 295 | "NOT_SET" 296 | ], 297 | "qos": "normal", 298 | "reboot": false, 299 | "required_nodes": "", 300 | "minimum_switches": 0, 301 | "requeue": false, 302 | "resize_time": { 303 | "set": true, 304 | "infinite": false, 305 | "number": 0 306 | }, 307 | "restart_cnt": 0, 308 | "resv_name": "", 309 | "scheduled_nodes": "", 310 | "selinux_context": "", 311 | "shared": [], 312 | "exclusive": [], 313 | "oversubscribe": true, 314 | "show_flags": [ 315 | "ALL", 316 | "DETAIL", 317 | "LOCAL" 318 | ], 319 | "sockets_per_board": 0, 320 | "sockets_per_node": { 321 | "set": false, 322 | "infinite": false, 323 | "number": 0 324 | }, 325 | "start_time": { 326 | "set": true, 327 | "infinite": false, 328 | "number": 1722274361 
329 | }, 330 | "state_description": "", 331 | "state_reason": "None", 332 | "standard_error": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_411.out", 333 | "standard_input": "/dev/null", 334 | "standard_output": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_411.out", 335 | "submit_time": { 336 | "set": true, 337 | "infinite": false, 338 | "number": 1722268317 339 | }, 340 | "suspend_time": { 341 | "set": true, 342 | "infinite": false, 343 | "number": 0 344 | }, 345 | "system_comment": "", 346 | "time_limit": { 347 | "set": true, 348 | "infinite": false, 349 | "number": 1440 350 | }, 351 | "time_minimum": { 352 | "set": true, 353 | "infinite": false, 354 | "number": 0 355 | }, 356 | "threads_per_core": { 357 | "set": false, 358 | "infinite": false, 359 | "number": 0 360 | }, 361 | "tres_bind": "", 362 | "tres_freq": "", 363 | "tres_per_job": "", 364 | "tres_per_node": "", 365 | "tres_per_socket": "", 366 | "tres_per_task": "", 367 | "tres_req_str": "cpu=1,mem=4G,node=1,billing=1", 368 | "tres_alloc_str": "cpu=1,mem=4G,node=1,billing=1", 369 | "user_id": 110622, 370 | "user_name": "rdennis", 371 | "maximum_switch_wait_time": 0, 372 | "wckey": "", 373 | "current_working_directory": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all" 374 | }, 375 | { 376 | "account": "jamming", 377 | "accrue_time": { 378 | "set": true, 379 | "infinite": false, 380 | "number": 1722268326 381 | }, 382 | "admin_comment": "", 383 | "allocating_node": "login1", 384 | "array_job_id": { 385 | "set": true, 386 | "infinite": false, 387 | "number": 7725337 388 | }, 389 | "array_task_id": { 390 | "set": false, 391 | "infinite": false, 392 | "number": 0 393 | }, 394 | "array_max_tasks": { 395 | "set": true, 396 | "infinite": false, 397 | "number": 0 398 | }, 399 | "array_task_string": "412-9999", 400 | "association_id": 70, 401 | "batch_features": "", 402 | "batch_flag": true, 403 | "batch_host": "", 404 | "flags": [ 405 | "USING_DEFAULT_QOS", 406 | "USING_DEFAULT_WCKEY" 407 | ], 408 | "burst_buffer": "", 409 | "burst_buffer_state": "", 410 | "cluster": "talapas", 411 | "cluster_features": "", 412 | "command": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/relaxAndShearScheme.srun", 413 | "comment": "", 414 | "container": "", 415 | "container_id": "", 416 | "contiguous": false, 417 | "core_spec": 0, 418 | "thread_spec": 32766, 419 | "cores_per_socket": { 420 | "set": false, 421 | "infinite": false, 422 | "number": 0 423 | }, 424 | "billable_tres": { 425 | "set": false, 426 | "infinite": false, 427 | "number": 0.0 428 | }, 429 | "cpus_per_task": { 430 | "set": true, 431 | "infinite": false, 432 | "number": 1 433 | }, 434 | "cpu_frequency_minimum": { 435 | "set": false, 436 | "infinite": false, 437 | "number": 0 438 | }, 439 | "cpu_frequency_maximum": { 440 | "set": false, 441 | "infinite": false, 442 | "number": 0 443 | }, 444 | "cpu_frequency_governor": { 445 | "set": false, 446 | "infinite": false, 447 | "number": 0 448 | }, 449 | "cpus_per_tres": "", 450 | "cron": "", 451 | "deadline": { 452 | "set": true, 453 | "infinite": false, 454 | "number": 0 455 | }, 456 | "delay_boot": { 457 | "set": true, 458 | "infinite": false, 459 | "number": 0 460 | }, 461 | "dependency": "", 462 | "derived_exit_code": { 463 | "status": [ 464 | "SUCCESS" 465 | ], 466 | "return_code": { 467 | "set": true, 468 | "infinite": false, 469 | "number": 0 470 | }, 471 | "signal": { 472 | "id": { 473 | "set": false, 474 | "infinite": false, 475 | "number": 0 476 | }, 477 | "name": 
"" 478 | } 479 | }, 480 | "eligible_time": { 481 | "set": true, 482 | "infinite": false, 483 | "number": 1722268326 484 | }, 485 | "end_time": { 486 | "set": true, 487 | "infinite": false, 488 | "number": 0 489 | }, 490 | "excluded_nodes": "", 491 | "exit_code": { 492 | "status": [ 493 | "SUCCESS" 494 | ], 495 | "return_code": { 496 | "set": true, 497 | "infinite": false, 498 | "number": 0 499 | }, 500 | "signal": { 501 | "id": { 502 | "set": false, 503 | "infinite": false, 504 | "number": 0 505 | }, 506 | "name": "" 507 | } 508 | }, 509 | "extra": "", 510 | "failed_node": "", 511 | "features": "", 512 | "federation_origin": "", 513 | "federation_siblings_active": "", 514 | "federation_siblings_viable": "", 515 | "gres_detail": [], 516 | "group_id": 131, 517 | "group_name": "uoregon", 518 | "het_job_id": { 519 | "set": true, 520 | "infinite": false, 521 | "number": 0 522 | }, 523 | "het_job_id_set": "", 524 | "het_job_offset": { 525 | "set": true, 526 | "infinite": false, 527 | "number": 0 528 | }, 529 | "job_id": 7725337, 530 | "job_resources": {}, 531 | "job_size_str": [], 532 | "job_state": [ 533 | "PENDING" 534 | ], 535 | "last_sched_evaluation": { 536 | "set": true, 537 | "infinite": false, 538 | "number": 1722274361 539 | }, 540 | "licenses": "", 541 | "mail_type": [], 542 | "mail_user": "rdennis", 543 | "max_cpus": { 544 | "set": true, 545 | "infinite": false, 546 | "number": 0 547 | }, 548 | "max_nodes": { 549 | "set": true, 550 | "infinite": false, 551 | "number": 1 552 | }, 553 | "mcs_label": "", 554 | "memory_per_tres": "", 555 | "name": "rands", 556 | "network": "", 557 | "nodes": "", 558 | "nice": 0, 559 | "tasks_per_core": { 560 | "set": false, 561 | "infinite": true, 562 | "number": 0 563 | }, 564 | "tasks_per_tres": { 565 | "set": true, 566 | "infinite": false, 567 | "number": 0 568 | }, 569 | "tasks_per_node": { 570 | "set": true, 571 | "infinite": false, 572 | "number": 1 573 | }, 574 | "tasks_per_socket": { 575 | "set": false, 576 | "infinite": true, 577 | "number": 0 578 | }, 579 | "tasks_per_board": { 580 | "set": true, 581 | "infinite": false, 582 | "number": 0 583 | }, 584 | "cpus": { 585 | "set": true, 586 | "infinite": false, 587 | "number": 1 588 | }, 589 | "node_count": { 590 | "set": true, 591 | "infinite": false, 592 | "number": 1 593 | }, 594 | "tasks": { 595 | "set": true, 596 | "infinite": false, 597 | "number": 1 598 | }, 599 | "partition": "preempt", 600 | "prefer": "", 601 | "memory_per_cpu": { 602 | "set": true, 603 | "infinite": false, 604 | "number": 4096 605 | }, 606 | "memory_per_node": { 607 | "set": false, 608 | "infinite": false, 609 | "number": 0 610 | }, 611 | "minimum_cpus_per_node": { 612 | "set": true, 613 | "infinite": false, 614 | "number": 1 615 | }, 616 | "minimum_tmp_disk_per_node": { 617 | "set": true, 618 | "infinite": false, 619 | "number": 0 620 | }, 621 | "power": { 622 | "flags": [] 623 | }, 624 | "preempt_time": { 625 | "set": true, 626 | "infinite": false, 627 | "number": 0 628 | }, 629 | "preemptable_time": { 630 | "set": true, 631 | "infinite": false, 632 | "number": 0 633 | }, 634 | "pre_sus_time": { 635 | "set": true, 636 | "infinite": false, 637 | "number": 0 638 | }, 639 | "priority": { 640 | "set": true, 641 | "infinite": false, 642 | "number": 169465 643 | }, 644 | "profile": [ 645 | "NOT_SET" 646 | ], 647 | "qos": "normal", 648 | "reboot": false, 649 | "required_nodes": "", 650 | "minimum_switches": 0, 651 | "requeue": false, 652 | "resize_time": { 653 | "set": true, 654 | "infinite": false, 655 | "number": 0 656 | }, 
657 | "restart_cnt": 0, 658 | "resv_name": "", 659 | "scheduled_nodes": "", 660 | "selinux_context": "", 661 | "shared": [], 662 | "exclusive": [], 663 | "oversubscribe": true, 664 | "show_flags": [ 665 | "ALL", 666 | "DETAIL", 667 | "LOCAL" 668 | ], 669 | "sockets_per_board": 0, 670 | "sockets_per_node": { 671 | "set": false, 672 | "infinite": false, 673 | "number": 0 674 | }, 675 | "start_time": { 676 | "set": true, 677 | "infinite": false, 678 | "number": 0 679 | }, 680 | "state_description": "", 681 | "state_reason": "Resources", 682 | "standard_error": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_4294967294.out", 683 | "standard_input": "/dev/null", 684 | "standard_output": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all/slurm-7725337_4294967294.out", 685 | "submit_time": { 686 | "set": true, 687 | "infinite": false, 688 | "number": 1722268317 689 | }, 690 | "suspend_time": { 691 | "set": true, 692 | "infinite": false, 693 | "number": 0 694 | }, 695 | "system_comment": "", 696 | "time_limit": { 697 | "set": true, 698 | "infinite": false, 699 | "number": 1440 700 | }, 701 | "time_minimum": { 702 | "set": true, 703 | "infinite": false, 704 | "number": 0 705 | }, 706 | "threads_per_core": { 707 | "set": false, 708 | "infinite": false, 709 | "number": 0 710 | }, 711 | "tres_bind": "", 712 | "tres_freq": "", 713 | "tres_per_job": "", 714 | "tres_per_node": "", 715 | "tres_per_socket": "", 716 | "tres_per_task": "", 717 | "tres_req_str": "cpu=1,mem=4G,node=1,billing=1", 718 | "tres_alloc_str": "", 719 | "user_id": 110622, 720 | "user_name": "rdennis", 721 | "maximum_switch_wait_time": 0, 722 | "wckey": "", 723 | "current_working_directory": "/gpfs/home/rdennis/timeTemperatureEquivalence/scheme/all" 724 | } 725 | ], 726 | "last_backfill": { 727 | "set": true, 728 | "infinite": false, 729 | "number": 1722274362 730 | }, 731 | "last_update": { 732 | "set": true, 733 | "infinite": false, 734 | "number": 1722274381 735 | }, 736 | "meta": { 737 | "plugin": { 738 | "type": "openapi/slurmctld", 739 | "name": "Slurm OpenAPI slurmctld", 740 | "data_parser": "data_parser/v0.0.40", 741 | "accounting_storage": "accounting_storage/slurmdbd" 742 | }, 743 | "client": { 744 | "source": "[10.174.139.109]:49957", 745 | "user": "root", 746 | "group": "root" 747 | }, 748 | "command": [], 749 | "slurm": { 750 | "version": { 751 | "major": "23", 752 | "micro": "1", 753 | "minor": "11" 754 | }, 755 | "release": "23.11.1", 756 | "cluster": "talapas" 757 | } 758 | }, 759 | "errors": [], 760 | "warnings": [] 761 | } 762 | --------------------------------------------------------------------------------