├── .codespellrc ├── .dockerignore ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── e2e_test.yml │ ├── github_release.yml │ ├── gpubench_only.yml │ └── one_job.yml ├── .gitignore ├── .golangci.yaml ├── .mockery.yaml ├── CODEOWNERS ├── LICENSE ├── Makefile ├── PROJECT ├── README.md ├── SECURITY.md ├── VERSION ├── api ├── v1 │ ├── groupversion_info.go │ ├── slurmcluster_types.go │ └── zz_generated.deepcopy.go └── v1alpha1 │ ├── activecheck_types.go │ ├── extra_types.go │ ├── groupversion_info.go │ ├── nodeconfigurator_types.go │ ├── nodeset_types.go │ └── zz_generated.deepcopy.go ├── cmd ├── exporter │ └── main.go ├── main.go ├── rebooter │ └── main.go ├── sconfigcontroller │ └── main.go └── soperatorchecks │ └── main.go ├── config ├── certmanager │ ├── certificate.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── crd │ ├── bases │ │ ├── kustomization.yaml │ │ ├── slurm.nebius.ai_activechecks.yaml │ │ ├── slurm.nebius.ai_nodeconfigurators.yaml │ │ ├── slurm.nebius.ai_nodesets.yaml │ │ └── slurm.nebius.ai_slurmclusters.yaml │ ├── kustomization.yaml │ └── kustomizeconfig.yaml ├── default │ ├── kustomization.yaml │ ├── manager_auth_proxy_patch.yaml │ ├── manager_config_patch.yaml │ └── manager_webhook_patch.yaml ├── manager │ ├── kustomization.yaml │ └── manager.yaml ├── network-policy │ └── allow-webhook-traffic.yaml ├── nodeconfigurator │ └── kustomization.yaml ├── prometheus │ ├── kustomization.yaml │ └── monitor.yaml ├── rbac │ ├── activecheck_admin_role.yaml │ ├── activecheck_editor_role.yaml │ ├── activecheck_viewer_role.yaml │ ├── auth_proxy_client_clusterrole.yaml │ ├── auth_proxy_role.yaml │ ├── auth_proxy_role_binding.yaml │ ├── auth_proxy_service.yaml │ ├── clustercontroller │ │ ├── kustomization.yaml │ │ └── role.yaml │ ├── kustomization.yaml │ ├── leader_election_role.yaml │ ├── leader_election_role_binding.yaml │ ├── nodeconfigurator │ │ ├── kustomization.yaml │ │ ├── role.yaml │ │ └── role_binding.yaml │ ├── nodeconfigurator_admin_role.yaml │ ├── nodeconfigurator_editor_role.yaml │ ├── nodeconfigurator_viewer_role.yaml │ ├── nodeset_admin_role.yaml │ ├── nodeset_editor_role.yaml │ ├── nodeset_viewer_role.yaml │ ├── role_binding.yaml │ ├── service_account.yaml │ ├── slurmcluster_editor_role.yaml │ ├── slurmcluster_viewer_role.yaml │ ├── soperatorchecks │ │ ├── kustomization.yaml │ │ └── role.yaml │ ├── soperatorchecks_admin_role.yaml │ ├── soperatorchecks_editor_role.yaml │ └── soperatorchecks_viewer_role.yaml ├── samples │ ├── kustomization.yaml │ ├── slurm_v1alpha1_activecheck.yaml │ ├── slurm_v1alpha1_nodeconfigurator.yaml │ └── slurm_v1alpha1_nodeset.yaml ├── soperatorchecks │ ├── kustomization.yaml │ ├── manager_auth_proxy_patch.yaml │ └── soperatorchecks.yaml └── webhook │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ ├── manifests.yaml │ └── service.yaml ├── docs ├── architecture.md ├── features.md ├── future_plans.md ├── images │ ├── architecture_diagram.svg │ ├── directory_structure_diagram.svg │ ├── easy_scaling_diagram.svg │ ├── gpu_benchmark_diagram.svg │ └── layers_diagram.png └── limitations.md ├── fluxcd ├── README.md ├── base │ └── soperator-fluxcd │ │ ├── kustomization.yaml │ │ └── resources.yaml └── environment │ └── nebius-cloud │ ├── base │ ├── custom-configmaps-soperator │ │ ├── 95-nebius-o11y │ │ ├── daemon.json │ │ ├── enroot.conf │ │ ├── epilog.sh │ │ ├── kustomization.yaml │ │ ├── prolog.sh │ │ └── supervisord.conf │ ├── flux_kustomization.yaml │ └── kustomization.yaml │ ├── dev │ ├── bootstrap │ │ ├── 
flux-kustomization.yaml │ │ ├── git-repository.yaml │ │ └── kustomization.yaml │ ├── helmrelease-patch.yaml │ ├── helmrepository-patch.yaml │ └── kustomization.yaml │ └── prod │ ├── bootstrap │ ├── flux-kustomization.yaml │ ├── git-repository.yaml │ └── kustomization.yaml │ └── kustomization.yaml ├── go.mod ├── go.sum ├── hack └── boilerplate.go.txt ├── helm ├── nodeconfigurator │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── binding-rbac.yaml │ │ ├── nodeconfigurator-cr.yaml │ │ ├── nodeconfigurator-rbac.yaml │ │ └── serviceaccount.yaml │ └── values.yaml ├── slurm-cluster-storage │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── accounting-mount-daemonset.yaml │ │ ├── accounting-pv.yaml │ │ ├── controller-spool-mount-daemonset.yaml │ │ ├── controller-spool-pv.yaml │ │ ├── controller-spool-pvc.yaml │ │ ├── jail-mount-daemonset.yaml │ │ ├── jail-pv.yaml │ │ ├── jail-pvc.yaml │ │ ├── jail-submounts-mount-daemonset.yaml │ │ ├── jail-submounts-pv.yaml │ │ ├── jail-submounts-pvc.yaml │ │ ├── local-storageclass.yaml │ │ └── mount-scripts-configmap.yaml │ └── values.yaml ├── slurm-cluster │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── _registry_helpers.tpl │ │ ├── _secret_helpers.tpl │ │ ├── priority-class.yaml │ │ ├── pvc.yaml │ │ └── slurm-cluster-cr.yaml │ └── values.yaml ├── soperator-activechecks │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── templates │ │ ├── _helpers.tpl │ │ └── activecheck.yaml │ └── values.yaml ├── soperator-crds │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ └── slurmcluster-crd.yaml │ └── values.yaml ├── soperator-dcgm-exporter │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── configmap.yaml │ │ ├── daemonset.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── servicemonitor.yaml │ └── values.yaml ├── soperator-fluxcd │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── backup.yaml │ │ ├── backup_schedule.yaml │ │ ├── cert-manager.yaml │ │ ├── dcgm-exporter.yaml │ │ ├── helmrepository.yaml │ │ ├── kruise.yaml │ │ ├── mariadb-operator-crds.yaml │ │ ├── mariadb-operator.yaml │ │ ├── namespaces.yaml │ │ ├── nodeconfigurator.yaml │ │ ├── opentelemetry-collector-events.yaml │ │ ├── opentelemetry-collector-logs.yaml │ │ ├── prometheus-operator-crds.yaml │ │ ├── security-profiles-operator.yaml │ │ ├── slurm-cluster-storage.yaml │ │ ├── slurm-cluster.yaml │ │ ├── soperator.yaml │ │ ├── soperatorchecks.yaml │ │ ├── victoria-metrics-operator-crds.yaml │ │ ├── vm-logs.yaml │ │ └── vm-stack.yaml │ ├── tests │ │ ├── component_enabled_test.yaml │ │ ├── namespace_test.yaml │ │ ├── soperator_dependency_test.yaml │ │ └── values_override_test.yaml │ └── values.yaml ├── soperator │ ├── .gitignore │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── charts │ │ └── .gitkeep │ ├── crds │ │ └── slurmcluster-crd.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── binding-rbac.yaml │ │ ├── deployment.yaml │ │ ├── leader-election-rbac.yaml │ │ ├── manager-rbac.yaml │ │ ├── metrics-reader-rbac.yaml │ │ ├── metrics-service.yaml │ │ ├── proxy-rbac.yaml │ │ ├── selfsigned-issuer.yaml │ │ ├── serviceaccount.yaml │ │ ├── serving-cert.yaml │ │ ├── soperator-checks-rbac.yaml │ │ ├── validate-secrets.yaml │ │ ├── validation-configmap.yaml │ │ └── webhook-service.yaml │ └── values.yaml └── soperatorchecks │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ ├── 
_helpers.tpl │ ├── deployment.yaml │ ├── metrics-service.yaml │ ├── serviceaccount.yaml │ ├── soperator-checks-binding-rbac.yaml │ └── soperator-checks-rbac.yaml │ └── values.yaml ├── images ├── accounting │ ├── slurmdbd.dockerfile │ └── slurmdbd_entrypoint.sh ├── common │ ├── chroot-plugin │ │ └── chroot.c │ ├── enroot │ │ └── enroot.conf │ ├── nvidia-container-runtime │ │ └── config.toml │ └── scripts │ │ ├── bind_slurm_common.sh │ │ ├── complement_jail.sh │ │ ├── install_awscli.sh │ │ ├── install_chroot_plugin.sh │ │ ├── install_container_toolkit.sh │ │ ├── install_docker.sh │ │ ├── install_docker_cli.sh │ │ ├── install_enroot.sh │ │ ├── install_munge.sh │ │ ├── install_openmpi.sh │ │ ├── install_python.sh │ │ └── install_rclone.sh ├── controller │ ├── slurmctld.dockerfile │ └── slurmctld_entrypoint.sh ├── exporter │ └── exporter.dockerfile ├── jail │ ├── init-users │ │ ├── group │ │ ├── gshadow │ │ ├── passwd │ │ ├── shadow │ │ └── sudoers │ ├── jail.dockerfile │ ├── motd │ │ ├── 00-welcome │ │ ├── 10-system-info │ │ ├── 20-slurm-stats │ │ └── 30-ssh-users │ ├── pin_packages │ │ └── cuda-pins │ ├── scripts │ │ ├── createuser.sh │ │ ├── docker.sh │ │ ├── nvidia_smi_hostpid.sh │ │ └── srun_perf_run.sh │ └── skel │ │ ├── .bash_logout │ │ ├── .bashrc │ │ ├── .config │ │ └── enroot │ │ │ └── .credentials │ │ ├── .profile │ │ └── .slurm │ │ └── defaults ├── k8s_check_job │ └── k8s_check_job.dockerfile ├── login │ ├── sshd.dockerfile │ └── sshd_entrypoint.sh ├── munge │ ├── munge.dockerfile │ └── munge_entrypoint.sh ├── nccl_benchmark │ ├── nccl_benchmark.dockerfile │ ├── nccl_benchmark_entrypoint.sh │ └── scripts │ │ └── srun_perf.sh ├── populate_jail │ ├── populate_jail.dockerfile │ └── populate_jail_entrypoint.sh ├── rebooter │ └── rebooter.dockerfile ├── restd │ ├── slurmrestd.dockerfile │ └── slurmrestd_entrypoint.sh ├── sconfigcontroller │ └── sconfigcontroller.dockerfile ├── slurm_check_job │ ├── slurm_check_job.dockerfile │ └── slurm_check_job_entrypoint.sh ├── soperator-exporter │ └── soperator-exporter.dockerfile ├── soperator │ └── Dockerfile ├── soperatorchecks │ └── soperatorchecks.dockerfile └── worker │ ├── docker │ └── daemon.json │ ├── gpubench │ ├── go.mod │ ├── go.sum │ ├── main.go │ └── main_test.go │ ├── scripts │ └── gpu_healthcheck.sh │ ├── slurmd.dockerfile │ ├── slurmd_entrypoint.sh │ └── supervisord_entrypoint.sh ├── internal ├── check │ ├── consts.go │ ├── installed_crd.go │ ├── maintanence.go │ ├── resources.go │ └── resources_test.go ├── consts │ ├── accounting.go │ ├── activecheck.go │ ├── annotation.go │ ├── cgroup.go │ ├── cluster_type.go │ ├── component.go │ ├── conditions.go │ ├── configmap.go │ ├── container.go │ ├── cronjob.go │ ├── indexfield.go │ ├── job.go │ ├── label.go │ ├── maintenance.go │ ├── mariadb.go │ ├── nccl_topology_type.go │ ├── node_configurator.go │ ├── nvidia.go │ ├── pagination.go │ ├── rest.go │ ├── sconfigcontroller.go │ ├── secret.go │ ├── service.go │ ├── slurm.go │ ├── sshd.go │ ├── statefulset.go │ ├── version.go │ └── volume.go ├── controller │ ├── clustercontroller │ │ ├── accounting.go │ │ ├── benchmark.go │ │ ├── common.go │ │ ├── controller.go │ │ ├── exporter.go │ │ ├── login.go │ │ ├── populate_job.go │ │ ├── reconcile.go │ │ ├── rest.go │ │ ├── sconfigcontroller.go │ │ ├── soperator_exporter.go │ │ └── worker.go │ ├── common_subjects_test.go │ ├── fixtures_test.go │ ├── helpers_test.go │ ├── nodeconfigurator │ │ └── nodeconfigurator_controller.go │ ├── nodesetcontroller │ │ ├── controller.go │ │ ├── controller_test.go │ │ 
└── indexer.go │ ├── reconciler │ │ ├── apparmorprofile.go │ │ ├── deployment.go │ │ ├── deployment_test.go │ │ ├── fake_error_client_test.go │ │ ├── grant.go │ │ ├── k8s_configmap.go │ │ ├── k8s_cronjob.go │ │ ├── k8s_daemonset.go │ │ ├── k8s_job.go │ │ ├── k8s_role.go │ │ ├── k8s_role_test.go │ │ ├── k8s_rolebinding_test.go │ │ ├── k8s_rolebinging.go │ │ ├── k8s_secret.go │ │ ├── k8s_service.go │ │ ├── k8s_service_account.go │ │ ├── k8s_service_test.go │ │ ├── k8s_statefulset.go │ │ ├── k8s_statefulset_advanced.go │ │ ├── k8s_statefulset_test.go │ │ ├── mariadb.go │ │ ├── otel.go │ │ ├── otel_test.go │ │ ├── pod_monitor.go │ │ ├── pod_monitor_test.go │ │ ├── reconciler.go │ │ └── versioning.go │ ├── sconfigcontroller │ │ ├── controller.go │ │ ├── controller_test.go │ │ ├── fake │ │ │ └── mock_store.go │ │ ├── file_store.go │ │ └── file_store_test.go │ ├── soperatorchecks │ │ ├── activecheck_controller.go │ │ ├── activecheck_jobs_controller.go │ │ ├── k8s_nodes_controller.go │ │ ├── serviceaccount_controller.go │ │ ├── slurm_api_clients_controller.go │ │ ├── slurm_nodes_controller.go │ │ ├── slurm_nodes_controller_test.go │ │ └── soperatorchecks.go │ ├── state │ │ └── state.go │ ├── suite_test.go │ └── topologyconfcontroller │ │ ├── base_reconciler.go │ │ ├── nodetopology_controller.go │ │ ├── nodetopology_controller_test.go │ │ ├── topology_graph.go │ │ ├── topology_graph_test.go │ │ ├── workertopology_controller.go │ │ └── workertopology_controller_test.go ├── controllerconfig │ └── options.go ├── exporter │ ├── collector.go │ ├── collector_test.go │ ├── exporter.go │ └── state.go ├── jwt │ ├── consts.go │ ├── registry.go │ ├── registry_test.go │ ├── signing_key.go │ ├── token.go │ └── token_test.go ├── logfield │ ├── fields.go │ └── resource.go ├── naming │ ├── naming.go │ └── naming_test.go ├── rebooter │ ├── reconcile.go │ └── reconcile_test.go ├── render │ ├── accounting │ │ ├── container.go │ │ ├── deployment.go │ │ ├── deployment_test.go │ │ ├── grant.go │ │ ├── grant_test.go │ │ ├── mariadb.go │ │ ├── mariadb_password.go │ │ ├── mariadb_test.go │ │ ├── pod.go │ │ ├── pod_test.go │ │ ├── secret.go │ │ ├── secret_test.go │ │ ├── service.go │ │ ├── service_test.go │ │ ├── vars_test.go │ │ ├── volume.go │ │ └── volume_test.go │ ├── benchmark │ │ ├── container.go │ │ ├── container_test.go │ │ └── cronjob.go │ ├── common │ │ ├── apparmorprofile.go │ │ ├── configmap.go │ │ ├── configmap_test.go │ │ ├── container.go │ │ ├── label.go │ │ ├── pod.go │ │ ├── pod_test.go │ │ ├── probe.go │ │ ├── resources.go │ │ ├── secret.go │ │ └── volume.go │ ├── controller │ │ ├── container.go │ │ ├── service.go │ │ ├── statefulset.go │ │ └── volume.go │ ├── exporter │ │ ├── container.go │ │ ├── container_test.go │ │ ├── deployment.go │ │ ├── names.go │ │ ├── pod.go │ │ ├── pod_monitor.go │ │ ├── pod_monitor_test.go │ │ ├── pod_test.go │ │ ├── role.go │ │ ├── role_binding.go │ │ ├── service_account.go │ │ └── vars_test.go │ ├── login │ │ ├── configmap.go │ │ ├── container.go │ │ ├── service.go │ │ ├── statefulset.go │ │ └── volume.go │ ├── nodeconfigurator │ │ ├── container.go │ │ ├── daemonset.go │ │ └── pod.go │ ├── otel │ │ ├── otel.go │ │ └── otel_test.go │ ├── populate_jail │ │ ├── container.go │ │ └── job.go │ ├── prometheus │ │ ├── container.go │ │ ├── container_test.go │ │ ├── exporter.go │ │ ├── exporter_test.go │ │ ├── names.go │ │ ├── pod.go │ │ ├── pod_test.go │ │ ├── role.go │ │ ├── role_binding.go │ │ ├── serviceaccount.go │ │ └── vars_test.go │ ├── rest │ │ ├── container.go │ │ ├── pod.go 
│ │ ├── rest.go │ │ ├── secret.go │ │ └── service.go │ ├── sconfigcontroller │ │ ├── container.go │ │ ├── deployment.go │ │ ├── initcontainer.go │ │ ├── pod.go │ │ ├── role.go │ │ ├── role_binding.go │ │ └── serviceaccount.go │ ├── soperatorchecks │ │ ├── configmap.go │ │ ├── container.go │ │ ├── cronjob.go │ │ ├── job.go │ │ ├── pod.go │ │ ├── role.go │ │ ├── role_binding.go │ │ └── serviceaccount.go │ ├── utils │ │ └── config.go │ └── worker │ │ ├── configmap.go │ │ ├── container.go │ │ ├── container_test.go │ │ ├── role.go │ │ ├── role_binding.go │ │ ├── role_binding_test.go │ │ ├── role_test.go │ │ ├── service.go │ │ ├── serviceaccount.go │ │ ├── statefulset.go │ │ ├── statefulset_test.go │ │ └── volume.go ├── slurmapi │ ├── client.go │ ├── client_set.go │ ├── fake │ │ └── mock_client.go │ ├── interface.go │ ├── job.go │ ├── job_status.go │ ├── job_test.go │ ├── node.go │ ├── node_test.go │ ├── testdata │ │ ├── 2_node_job.json │ │ └── usual_node_rest.json │ ├── tres.go │ └── tres_test.go ├── utils │ ├── get_by.go │ ├── get_by_test.go │ ├── multistep_test.go │ ├── mutltistep.go │ ├── oneof.go │ ├── oneof_test.go │ ├── unique.go │ └── unique_test.go ├── values │ ├── slurm_accounting.go │ ├── slurm_accounting_test.go │ ├── slurm_cluster.go │ ├── slurm_config.go │ ├── slurm_controller.go │ ├── slurm_exporter.go │ ├── slurm_exporter_test.go │ ├── slurm_jail.go │ ├── slurm_login.go │ ├── slurm_periodic_checks.go │ ├── slurm_rest.go │ ├── slurm_sconfigcontroller.go │ ├── slurm_worker.go │ ├── slurm_worker_test.go │ ├── types.go │ └── validate.go └── webhook │ └── v1 │ ├── secret_webhook.go │ └── secret_webhook_test.go ├── pkg └── jwt │ └── jwt.go ├── release_helm.sh └── test └── e2e └── e2e_test.go /.codespellrc: -------------------------------------------------------------------------------- 1 | [codespell] 2 | # Ref: https://github.com/codespell-project/codespell#using-a-config-file 3 | skip = .git*,*.svg,go.sum,.codespellrc 4 | check-hidden = true 5 | ignore-words-list = notin 6 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /config 3 | /deps 4 | /docker 5 | /slurm-service/internal/docs 6 | /helm* 7 | /terraform* 8 | /test 9 | .* 10 | *.sh 11 | Makefile 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | indent_style = space 11 | indent_size = 2 12 | 13 | [{Makefile,go.mod,go.sum,*.go,.gitmodules}] 14 | end_of_line = lf 15 | insert_final_newline = true 16 | indent_style = tab 17 | indent_size = 4 18 | 19 | [*.md] 20 | indent_size = 4 21 | trim_trailing_whitespace = false 22 | 23 | [*.yml] 24 | indent_style = space 25 | indent_size = 2 26 | 27 | [*.yaml] 28 | indent_style = space 29 | indent_size = 2 30 | 31 | [Dockerfile] 32 | indent_size = 4 33 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: / 5 | schedule: 6 | interval: daily 7 | target-branch: "dev" 8 | 9 | - 
package-ecosystem: docker 10 | directory: / 11 | schedule: 12 | interval: daily 13 | target-branch: "dev" 14 | 15 | - package-ecosystem: gomod 16 | directory: / 17 | schedule: 18 | interval: daily 19 | target-branch: "dev" 20 | 21 | - package-ecosystem: gomod 22 | directory: /images/worker/gpubench 23 | schedule: 24 | interval: daily 25 | target-branch: "dev" 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | !go.mod 2 | .DS_Store 3 | /.idea/ 4 | bin 5 | cover.out 6 | release_all.sh 7 | upload_to_build_agent.sh 8 | .vscode 9 | 10 | # ignoring generated version folder 11 | /version/ 12 | -------------------------------------------------------------------------------- /.mockery.yaml: -------------------------------------------------------------------------------- 1 | with-expecter: true 2 | issue-845-fix: True 3 | resolve-type-alias: False 4 | packages: 5 | nebius.ai/slurm-operator/internal/slurmapi: 6 | interfaces: 7 | Client: 8 | config: 9 | dir: "{{.InterfaceDirRelative}}/fake" 10 | outpkg: "fake" 11 | filename: "mock_{{ .InterfaceName | camelcase | firstLower }}.go" 12 | nebius.ai/slurm-operator/internal/controller/sconfigcontroller: 13 | interfaces: 14 | Store: 15 | config: 16 | dir: "{{.InterfaceDirRelative}}/fake" 17 | outpkg: "fake" 18 | filename: "mock_{{ .InterfaceName | camelcase | firstLower }}.go" 19 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global code owners (applies to the whole repo) 2 | * @dstaroff @asteny @rdjjke @Uburro @itechdima @theyoprst @ChessProfessor 3 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | The Nebius team takes security bugs seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. 4 | 5 | To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/nebius/soperator/security/advisories/new) tab. 6 | 7 | The Nebius team will send a response indicating the next steps in handling your report. After the initial reply to your report, the Nebius team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 8 | 9 | ## Learning More About Security in Nebius 10 | 11 | To learn more about security in Nebius, please see [this page](https://nebius.ai/docs/security). 
12 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 1.20.0 2 | -------------------------------------------------------------------------------- /api/v1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | // Package v1 contains API Schema definitions for the slurm v1 API group 2 | // +kubebuilder:object:generate=true 3 | // +groupName=slurm.nebius.ai 4 | package v1 5 | 6 | import ( 7 | "k8s.io/apimachinery/pkg/runtime/schema" 8 | "sigs.k8s.io/controller-runtime/pkg/scheme" 9 | ) 10 | 11 | var ( 12 | // GroupVersion is group version used to register these objects 13 | GroupVersion = schema.GroupVersion{Group: "slurm.nebius.ai", Version: "v1"} 14 | 15 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 16 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 17 | 18 | // AddToScheme adds the types in this group-version to the given scheme. 19 | AddToScheme = SchemeBuilder.AddToScheme 20 | ) 21 | -------------------------------------------------------------------------------- /api/v1alpha1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 Nebius B.V. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1alpha1 contains API Schema definitions for the slurm v1alpha1 API group. 18 | // +kubebuilder:object:generate=true 19 | // +groupName=slurm.nebius.ai 20 | package v1alpha1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects. 29 | GroupVersion = schema.GroupVersion{Group: "slurm.nebius.ai", Version: "v1alpha1"} 30 | 31 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme. 32 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 33 | 34 | // AddToScheme adds the types in this group-version to the given scheme. 35 | AddToScheme = SchemeBuilder.AddToScheme 36 | ) 37 | -------------------------------------------------------------------------------- /config/certmanager/certificate.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a certificate CR. 2 | # More documentation can be found at https://docs.cert-manager.io 3 | # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes.
4 | apiVersion: cert-manager.io/v1 5 | kind: Issuer 6 | metadata: 7 | labels: 8 | app.kubernetes.io/name: slurm-operator 9 | app.kubernetes.io/managed-by: kustomize 10 | name: selfsigned-issuer 11 | namespace: system 12 | spec: 13 | selfSigned: {} 14 | --- 15 | apiVersion: cert-manager.io/v1 16 | kind: Certificate 17 | metadata: 18 | labels: 19 | app.kubernetes.io/name: certificate 20 | app.kubernetes.io/instance: serving-cert 21 | app.kubernetes.io/component: certificate 22 | app.kubernetes.io/created-by: slurm-operator 23 | app.kubernetes.io/part-of: slurm-operator 24 | app.kubernetes.io/managed-by: kustomize 25 | name: serving-cert # this name should match the one that appears in kustomizeconfig.yaml 26 | namespace: system 27 | spec: 28 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 29 | dnsNames: 30 | - SERVICE_NAME.SERVICE_NAMESPACE.svc 31 | - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local 32 | issuerRef: 33 | kind: Issuer 34 | name: selfsigned-issuer 35 | secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize 36 | -------------------------------------------------------------------------------- /config/certmanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - certificate.yaml 3 | 4 | configurations: 5 | - kustomizeconfig.yaml 6 | -------------------------------------------------------------------------------- /config/certmanager/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is for teaching kustomize how to update name ref substitution 2 | nameReference: 3 | - kind: Issuer 4 | group: cert-manager.io 5 | fieldSpecs: 6 | - kind: Certificate 7 | group: cert-manager.io 8 | path: spec/issuerRef/name 9 | -------------------------------------------------------------------------------- /config/crd/bases/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - slurm.nebius.ai_activechecks.yaml 3 | - slurm.nebius.ai_nodeconfigurators.yaml 4 | - slurm.nebius.ai_nodesets.yaml 5 | - slurm.nebius.ai_slurmclusters.yaml 6 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/slurm.nebius.ai_activechecks.yaml 6 | - bases/slurm.nebius.ai_nodeconfigurators.yaml 7 | - bases/slurm.nebius.ai_nodesets.yaml 8 | - bases/slurm.nebius.ai_slurmclusters.yaml 9 | 10 | #+kubebuilder:scaffold:crdkustomizeresource 11 | 12 | patches: [] 13 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 14 | # patches here are for enabling the conversion webhook for each CRD 15 | #+kubebuilder:scaffold:crdkustomizewebhookpatch 16 | 17 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix.
18 | # patches here are for enabling the CA injection for each CRD 19 | #- path: patches/cainjection_in_slurmclusters.yaml 20 | #+kubebuilder:scaffold:crdkustomizecainjectionpatch 21 | 22 | # [WEBHOOK] To enable webhook, uncomment the following section 23 | # the following config is for teaching kustomize how to do kustomization for CRDs. 24 | 25 | #configurations: 26 | #- kustomizeconfig.yaml 27 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/default/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch injects a sidecar container which is an HTTP proxy for the 2 | # controller manager. It performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: controller-manager 7 | namespace: system 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: kube-rbac-proxy 13 | securityContext: 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | drop: 17 | - "ALL" 18 | image: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0 19 | args: 20 | - "--secure-listen-address=0.0.0.0:8443" 21 | - "--upstream=http://127.0.0.1:8080/" 22 | - "--logtostderr=true" 23 | - "--v=0" 24 | ports: 25 | - containerPort: 8443 26 | protocol: TCP 27 | name: https 28 | resources: 29 | limits: 30 | memory: 128Mi 31 | requests: 32 | cpu: 50m 33 | memory: 64Mi 34 | - name: manager 35 | args: 36 | - "--health-probe-bind-address=:8081" 37 | - "--metrics-bind-address=127.0.0.1:8080" 38 | - "--leader-elect" 39 | -------------------------------------------------------------------------------- /config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | -------------------------------------------------------------------------------- /config/default/manager_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | labels: 7 | app.kubernetes.io/name: slurm-operator 8 | app.kubernetes.io/managed-by: kustomize 9 | spec: 10 | template: 11 | spec: 12 | containers: 13 | - name: manager 14 | ports: 15 | - containerPort: 9443 16 | name: webhook-server 17 | protocol: TCP 18 | volumeMounts: 19 | - mountPath: /tmp/k8s-webhook-server/serving-certs 20 | name: cert 21 | readOnly: true 22 | volumes: 23 | - name: cert 24 | secret: 25 | defaultMode: 420 26 | secretName: webhook-server-cert
27 | -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | images: 4 | - name: controller 5 | newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator 6 | newTag: 1.20.0 7 | -------------------------------------------------------------------------------- /config/network-policy/allow-webhook-traffic.yaml: -------------------------------------------------------------------------------- 1 | # This NetworkPolicy allows ingress traffic to your webhook server running 2 | # as part of the controller-manager from specific namespaces and pods. CRs that use webhooks 3 | # will only work when applied in namespaces labeled with 'webhook: enabled' 4 | apiVersion: networking.k8s.io/v1 5 | kind: NetworkPolicy 6 | metadata: 7 | labels: 8 | app.kubernetes.io/name: slurm-operator 9 | app.kubernetes.io/managed-by: kustomize 10 | name: allow-webhook-traffic 11 | namespace: system 12 | spec: 13 | podSelector: 14 | matchLabels: 15 | control-plane: controller-manager 16 | policyTypes: 17 | - Ingress 18 | ingress: 19 | # This allows ingress traffic from any namespace with the label webhook: enabled 20 | - from: 21 | - namespaceSelector: 22 | matchLabels: 23 | webhook: enabled # Only from namespaces with this label 24 | ports: 25 | - port: 443 26 | protocol: TCP 27 | -------------------------------------------------------------------------------- /config/nodeconfigurator/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../crd/bases/slurm.nebius.ai_nodeconfigurators.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | labels: 6 | control-plane: controller-manager 7 | app.kubernetes.io/name: servicemonitor 8 | app.kubernetes.io/instance: controller-manager-metrics-monitor 9 | app.kubernetes.io/component: metrics 10 | app.kubernetes.io/created-by: slurm-operator 11 | app.kubernetes.io/part-of: slurm-operator 12 | app.kubernetes.io/managed-by: kustomize 13 | name: controller-manager-metrics-monitor 14 | namespace: system 15 | spec: 16 | endpoints: 17 | - path: /metrics 18 | port: https 19 | scheme: https 20 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 21 | tlsConfig: 22 | insecureSkipVerify: true 23 | selector: 24 | matchLabels: 25 | control-plane: controller-manager 26 | -------------------------------------------------------------------------------- /config/rbac/activecheck_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over slurm.nebius.ai.
5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: activecheck-admin-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - activechecks 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - slurm.nebius.ai 24 | resources: 25 | - activechecks/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/activecheck_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the slurm.nebius.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: activecheck-editor-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - activechecks 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - slurm.nebius.ai 30 | resources: 31 | - activechecks/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/activecheck_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to slurm.nebius.ai resources. 5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: activecheck-viewer-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - activechecks 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - slurm.nebius.ai 26 | resources: 27 | - activechecks/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: metrics-reader 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: metrics-reader 12 | rules: 13 | - nonResourceURLs: 14 | - "/metrics" 15 | verbs: 16 | - get 17 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: proxy-role 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-role 12 | rules: 13 | - apiGroups: 14 | - authentication.k8s.io 15 | resources: 16 | - tokenreviews 17 | verbs: 18 | - create 19 | - apiGroups: 20 | - authorization.k8s.io 21 | resources: 22 | - subjectaccessreviews 23 | verbs: 24 | - create 25 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: proxy-rolebinding 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: proxy-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: service 7 | app.kubernetes.io/instance: controller-manager-metrics-service 8 | app.kubernetes.io/component: kube-rbac-proxy 9 | app.kubernetes.io/created-by: slurm-operator 10 | app.kubernetes.io/part-of: slurm-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: controller-manager-metrics-service 13 | namespace: system 14 | spec: 15 | ports: 16 | - name: https 17 | port: 8443 18 | protocol: TCP 19 | 
targetPort: https 20 | selector: 21 | control-plane: controller-manager 22 | -------------------------------------------------------------------------------- /config/rbac/clustercontroller/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: role 7 | app.kubernetes.io/instance: leader-election-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: slurm-operator 10 | app.kubernetes.io/part-of: slurm-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: leader-election-role 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - configmaps 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - create 23 | - update 24 | - patch 25 | - delete 26 | - apiGroups: 27 | - coordination.k8s.io 28 | resources: 29 | - leases 30 | verbs: 31 | - get 32 | - list 33 | - watch 34 | - create 35 | - update 36 | - patch 37 | - delete 38 | - apiGroups: 39 | - "" 40 | resources: 41 | - events 42 | verbs: 43 | - create 44 | - patch 45 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: rolebinding 6 | app.kubernetes.io/instance: leader-election-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: leader-election-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: Role 15 | name: leader-election-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: nodeconfigurator-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - nodes 11 | verbs: 12 | - get 13 | - list 14 | - update 15 | - watch 16 | - apiGroups: 17 | - "" 18 | resources: 19 | - nodes/status 20 | verbs: 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - "" 28 | resources: 29 | - pods 30 | verbs: 31 | - get 32 | - list 33 | - watch 34 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator/role_binding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: RoleBinding 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: rolebinding 7 | 
app.kubernetes.io/instance: nodeconfigurator-rolebinding 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: nodeconfigurator-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: nodeconfigurator-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: nodeconfigurator 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over slurm.nebius.ai. 5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeconfigurator-admin-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodeconfigurators 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - slurm.nebius.ai 24 | resources: 25 | - nodeconfigurators/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the slurm.nebius.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeconfigurator-editor-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodeconfigurators 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - slurm.nebius.ai 30 | resources: 31 | - nodeconfigurators/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/nodeconfigurator_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to slurm.nebius.ai resources. 5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeconfigurator-viewer-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodeconfigurators 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - slurm.nebius.ai 26 | resources: 27 | - nodeconfigurators/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/rbac/nodeset_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over slurm.nebius.ai. 5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeset-admin-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodesets 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - slurm.nebius.ai 24 | resources: 25 | - nodesets/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/nodeset_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the slurm.nebius.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeset-editor-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodesets 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - slurm.nebius.ai 30 | resources: 31 | - nodesets/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/nodeset_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to slurm.nebius.ai resources. 5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: nodeset-viewer-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - nodesets 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - slurm.nebius.ai 26 | resources: 27 | - nodesets/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: manager-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: manager-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: manager-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: serviceaccount 6 | app.kubernetes.io/instance: controller-manager-sa 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: slurm-operator 9 | app.kubernetes.io/part-of: slurm-operator 10 | app.kubernetes.io/managed-by: kustomize 11 | name: controller-manager 12 | namespace: system 13 | --- 14 | apiVersion: v1 15 | kind: ServiceAccount 16 | metadata: 17 | labels: 18 | app.kubernetes.io/name: serviceaccount 19 | app.kubernetes.io/instance: rebooter 20 | app.kubernetes.io/component: rbac 21 | app.kubernetes.io/created-by: slurm-operator 22 | app.kubernetes.io/part-of: slurm-operator 23 | app.kubernetes.io/managed-by: kustomize 24 | name: rebooter 25 | namespace: system 26 | -------------------------------------------------------------------------------- /config/rbac/slurmcluster_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to edit slurmclusters. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: slurmcluster-editor-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: slurm-operator 10 | app.kubernetes.io/part-of: slurm-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: slurmcluster-editor-role 13 | rules: 14 | - apiGroups: 15 | - slurm.nebius.ai 16 | resources: 17 | - slurmclusters 18 | verbs: 19 | - create 20 | - delete 21 | - get 22 | - list 23 | - patch 24 | - update 25 | - watch 26 | - apiGroups: 27 | - slurm.nebius.ai 28 | resources: 29 | - slurmclusters/status 30 | verbs: 31 | - get 32 | -------------------------------------------------------------------------------- /config/rbac/slurmcluster_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view slurmclusters. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: clusterrole 7 | app.kubernetes.io/instance: slurmcluster-viewer-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: slurm-operator 10 | app.kubernetes.io/part-of: slurm-operator 11 | app.kubernetes.io/managed-by: kustomize 12 | name: slurmcluster-viewer-role 13 | rules: 14 | - apiGroups: 15 | - slurm.nebius.ai 16 | resources: 17 | - slurmclusters 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - apiGroups: 23 | - slurm.nebius.ai 24 | resources: 25 | - slurmclusters/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/soperatorchecks/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | -------------------------------------------------------------------------------- /config/rbac/soperatorchecks_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over slurm.nebius.ai. 5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: soperatorchecks-admin-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - soperatorchecks 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - slurm.nebius.ai 24 | resources: 25 | - soperatorchecks/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/soperatorchecks_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the slurm.nebius.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: soperatorchecks-editor-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - soperatorchecks 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - slurm.nebius.ai 30 | resources: 31 | - soperatorchecks/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/soperatorchecks_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project slurm-operator itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to slurm.nebius.ai resources. 
5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: slurm-operator 13 | app.kubernetes.io/managed-by: kustomize 14 | name: soperatorchecks-viewer-role 15 | rules: 16 | - apiGroups: 17 | - slurm.nebius.ai 18 | resources: 19 | - soperatorchecks 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - slurm.nebius.ai 26 | resources: 27 | - soperatorchecks/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/samples/kustomization.yaml: -------------------------------------------------------------------------------- 1 | ## Append samples of your project ## 2 | resources: 3 | - slurm_v1alpha1_activecheck.yaml 4 | - slurm_v1alpha1_nodeconfigurator.yaml 5 | - slurm_v1alpha1_nodeset.yaml 6 | #+kubebuilder:scaffold:manifestskustomizesamples 7 | -------------------------------------------------------------------------------- /config/samples/slurm_v1alpha1_activecheck.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: slurm.nebius.ai/v1alpha1 2 | kind: ActiveCheck 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: slurm-operator 6 | app.kubernetes.io/managed-by: kustomize 7 | name: activecheck-sample 8 | spec: 9 | # TODO(user): Add fields here 10 | -------------------------------------------------------------------------------- /config/samples/slurm_v1alpha1_nodeconfigurator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: slurm.nebius.ai/v1alpha1 2 | kind: NodeConfigurator 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: slurm-operator 6 | app.kubernetes.io/managed-by: kustomize 7 | name: nodeconfigurator-sample 8 | spec: 9 | # TODO(user): Add fields here 10 | -------------------------------------------------------------------------------- /config/samples/slurm_v1alpha1_nodeset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: slurm.nebius.ai/v1alpha1 2 | kind: NodeSet 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: slurm-operator 6 | app.kubernetes.io/managed-by: kustomize 7 | name: nodeset-sample 8 | spec: 9 | # TODO(user): Add fields here 10 | -------------------------------------------------------------------------------- /config/soperatorchecks/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - soperatorchecks.yaml 3 | - ../rbac/soperatorchecks/role.yaml 4 | images: 5 | - name: controller 6 | newName: cr.eu-north1.nebius.cloud/soperator/soperatorchecks 7 | newTag: 1.20.0 8 | patches: 9 | # Protect the /metrics endpoint by putting it behind auth. 10 | # If you want your controller-manager to expose the /metrics 11 | # endpoint w/o any authn/z, please comment the following line. 
12 | - path: ./manager_auth_proxy_patch.yaml 13 | -------------------------------------------------------------------------------- /config/soperatorchecks/manager_auth_proxy_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch injects a sidecar container that acts as an HTTP proxy for the 2 | # controller manager. It performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: soperatorchecks 7 | namespace: system 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: kube-rbac-proxy 13 | securityContext: 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | drop: 17 | - "ALL" 18 | image: gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0 19 | args: 20 | - "--secure-listen-address=0.0.0.0:8443" 21 | - "--upstream=http://127.0.0.1:8080/" 22 | - "--logtostderr=true" 23 | - "--v=0" 24 | ports: 25 | - containerPort: 8443 26 | protocol: TCP 27 | name: https 28 | resources: 29 | limits: 30 | memory: 128Mi 31 | requests: 32 | cpu: 50m 33 | memory: 64Mi 34 | - name: manager 35 | args: 36 | - "--health-probe-bind-address=:8081" 37 | - "--metrics-bind-address=127.0.0.1:8080" 38 | - "--leader-elect" 39 | -------------------------------------------------------------------------------- /config/webhook/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manifests.yaml 3 | - service.yaml 4 | 5 | configurations: 6 | - kustomizeconfig.yaml 7 | -------------------------------------------------------------------------------- /config/webhook/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # the following config is for teaching kustomize where to look when substituting nameReference. 2 | # It requires kustomize v2.1.0 or newer to work properly.
3 | nameReference: 4 | - kind: Service 5 | version: v1 6 | fieldSpecs: 7 | - kind: MutatingWebhookConfiguration 8 | group: admissionregistration.k8s.io 9 | path: webhooks/clientConfig/service/name 10 | - kind: ValidatingWebhookConfiguration 11 | group: admissionregistration.k8s.io 12 | path: webhooks/clientConfig/service/name 13 | 14 | namespace: 15 | - kind: MutatingWebhookConfiguration 16 | group: admissionregistration.k8s.io 17 | path: webhooks/clientConfig/service/namespace 18 | create: true 19 | - kind: ValidatingWebhookConfiguration 20 | group: admissionregistration.k8s.io 21 | path: webhooks/clientConfig/service/namespace 22 | create: true 23 | -------------------------------------------------------------------------------- /config/webhook/manifests.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingWebhookConfiguration 3 | metadata: 4 | name: validate-secrets 5 | webhooks: 6 | - name: validate.secrets.nebius.ai 7 | clientConfig: 8 | service: 9 | name: webhook-service 10 | namespace: system 11 | path: "/validate--v1-secret" 12 | caBundle: "" 13 | rules: 14 | - operations: ["CREATE", "UPDATE", "DELETE"] 15 | apiGroups: [""] 16 | apiVersions: ["v1"] 17 | resources: ["secrets"] 18 | objectSelector: 19 | matchLabels: 20 | slurm.nebius.ai/webhook: "true" 21 | failurePolicy: Fail 22 | admissionReviewVersions: ["v1", "v1beta1"] 23 | sideEffects: None 24 | -------------------------------------------------------------------------------- /config/webhook/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: slurm-operator 6 | app.kubernetes.io/managed-by: kustomize 7 | name: webhook-service 8 | namespace: system 9 | spec: 10 | ports: 11 | - port: 443 12 | protocol: TCP 13 | targetPort: 9443 14 | selector: 15 | control-plane: controller-manager 16 | -------------------------------------------------------------------------------- /docs/images/layers_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebius/soperator/c5d96cc2826b38c6b07bb9277ca895957794d319/docs/images/layers_diagram.png -------------------------------------------------------------------------------- /fluxcd/base/soperator-fluxcd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ./resources.yaml 5 | -------------------------------------------------------------------------------- /fluxcd/base/soperator-fluxcd/resources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1 2 | kind: HelmRepository 3 | metadata: 4 | name: soperator-fluxcd 5 | namespace: flux-system 6 | spec: 7 | type: "oci" 8 | interval: 24h 9 | url: oci://cr.eu-north1.nebius.cloud/soperator 10 | --- 11 | apiVersion: helm.toolkit.fluxcd.io/v2 12 | kind: HelmRelease 13 | metadata: 14 | name: soperator-fluxcd 15 | namespace: flux-system 16 | spec: 17 | interval: 60m 18 | driftDetection: 19 | mode: enabled 20 | chart: 21 | spec: 22 | chart: helm-soperator-fluxcd 23 | sourceRef: 24 | kind: HelmRepository 25 | name: soperator-fluxcd 26 | version: ${soperator_version} 27 | interval: 3m 28 | install: 29 | remediation: 30 | retries: 3 31 | 
targetNamespace: flux-system 32 | valuesFrom: 33 | - kind: ConfigMap 34 | name: terraform-fluxcd-values 35 | valuesKey: values.yaml 36 | optional: false 37 | - kind: ConfigMap 38 | name: soperator-fluxcd 39 | valuesKey: values.yaml 40 | optional: true 41 | - kind: Secret 42 | name: soperator-fluxcd 43 | valuesKey: values.yaml 44 | optional: true 45 | - kind: ConfigMap 46 | name: backup-schedule 47 | valuesKey: values.yaml 48 | optional: true 49 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/base/custom-configmaps-soperator/95-nebius-o11y: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | printf "\nTo open monitoring dashboards in your browser:\n" 4 | printf " 1. Execute this command on your local computer:\n" 5 | printf " \`ssh -L 3000:metrics-grafana.monitoring-system.svc:80 -N @\`\n" 6 | printf " 2. Open \`localhost:3000\` in your browser\n" 7 | 8 | printf "\nTo open logs explorer in your browser:\n" 9 | printf " 1. Execute this command on your local computer:\n" 10 | printf " \`ssh -L 9428:vm-logs-victoria-logs-single-server.logs-system.svc:9428 -N @\`\n" 11 | printf " 2. Open \`localhost:9428/select/vmui\` in your browser\n" 12 | 13 | printf "\n" 14 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/base/custom-configmaps-soperator/enroot.conf: -------------------------------------------------------------------------------- 1 | ENROOT_CACHE_PATH /mnt/image-storage/enroot/cache 2 | ENROOT_DATA_PATH /mnt/image-storage/enroot/data 3 | ENROOT_RUNTIME_PATH /mnt/image-storage/enroot/runtime 4 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/base/custom-configmaps-soperator/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | configMapGenerator: 4 | - name: custom-supervisord-config 5 | namespace: soperator 6 | files: 7 | - supervisord.conf 8 | options: 9 | disableNameSuffixHash: true 10 | - name: slurm-epilog 11 | namespace: soperator 12 | files: 13 | - epilog.sh 14 | options: 15 | disableNameSuffixHash: true 16 | - name: slurm-prolog 17 | namespace: soperator 18 | files: 19 | - prolog.sh 20 | options: 21 | disableNameSuffixHash: true 22 | - name: motd-nebius-o11y 23 | namespace: soperator 24 | files: 25 | - 95-nebius-o11y 26 | options: 27 | disableNameSuffixHash: true 28 | - name: image-storage 29 | namespace: soperator 30 | files: 31 | - daemon.json 32 | - enroot.conf 33 | options: 34 | disableNameSuffixHash: true 35 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/base/flux_kustomization.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: kustomize.toolkit.fluxcd.io/v1 3 | kind: Kustomization 4 | metadata: 5 | name: custom-configmaps-soperator 6 | namespace: flux-system 7 | spec: 8 | interval: 3m 9 | sourceRef: 10 | kind: GitRepository 11 | name: flux-system 12 | path: "./fluxcd/environment/nebius-cloud/base/custom-configmaps-soperator" 13 | prune: false 14 | timeout: 1m 15 | postBuild: 16 | substituteFrom: 17 | - kind: ConfigMap 18 | name: custom-configmaps-soperator-vars 19 | optional: true 20 | - kind: Secret 21 | name: custom-configmaps-soperator-vars 22 | optional: true 23 | 
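A note on the `substituteFrom` entries above: Flux exposes every key under `data` in the referenced ConfigMap or Secret as a `${variable}` available during post-build substitution of the manifests in `./fluxcd/environment/nebius-cloud/base/custom-configmaps-soperator`. A minimal sketch of such a ConfigMap, with a purely hypothetical `example_domain` key (not part of the repository; the actual keys depend on which placeholders the generated ConfigMaps reference):

```yaml
# Hypothetical example, not part of the repository.
apiVersion: v1
kind: ConfigMap
metadata:
  name: custom-configmaps-soperator-vars   # name referenced by flux_kustomization.yaml
  namespace: flux-system
data:
  example_domain: "slurm.example.com"      # becomes ${example_domain} during post-build substitution
```

Since both the ConfigMap and the Secret are marked `optional: true`, the Kustomization reconciles even when they are absent; once created, `flux get kustomizations -n flux-system` should show `custom-configmaps-soperator` as Ready.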
-------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ../../../base/soperator-fluxcd/ 5 | - ./flux_kustomization.yaml 6 | - https://github.com/k8up-io/k8up/releases/download/k8up-4.8.4/k8up-crd.yaml 7 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/bootstrap/flux-kustomization.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: kustomize.toolkit.fluxcd.io/v1 3 | kind: Kustomization 4 | metadata: 5 | name: flux-system 6 | namespace: flux-system 7 | spec: 8 | interval: 3m 9 | sourceRef: 10 | kind: GitRepository 11 | name: nebius-cloud 12 | postBuild: 13 | substitute: 14 | soperator_version: 1.20.0 15 | path: "./fluxcd/environment/nebius-cloud/dev" 16 | prune: true 17 | timeout: 1m 18 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/bootstrap/git-repository.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1 2 | kind: GitRepository 3 | metadata: 4 | name: flux-system 5 | namespace: flux-system 6 | spec: 7 | interval: 3m 8 | url: oci://cr.eu-north1.nebius.cloud/soperator-unstable 9 | ref: 10 | branch: dev 11 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/bootstrap/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ./git-repository.yaml 5 | - ./flux-kustomization.yaml 6 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/helmrelease-patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: helm.toolkit.fluxcd.io/v2 2 | kind: HelmRelease 3 | metadata: 4 | name: soperator-fluxcd 5 | namespace: flux-system 6 | spec: 7 | values: 8 | helmRepository: 9 | soperator: 10 | url: oci://cr.eu-north1.nebius.cloud/soperator-unstable 11 | type: oci 12 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/helmrepository-patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1 2 | kind: HelmRepository 3 | metadata: 4 | name: soperator-fluxcd 5 | namespace: flux-system 6 | spec: 7 | url: oci://cr.eu-north1.nebius.cloud/soperator-unstable 8 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/dev/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ../base 5 | patchesStrategicMerge: 6 | - helmrepository-patch.yaml 7 | - helmrelease-patch.yaml 8 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/prod/bootstrap/flux-kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion:
kustomize.toolkit.fluxcd.io/v1 2 | kind: Kustomization 3 | metadata: 4 | name: flux-system 5 | namespace: flux-system 6 | spec: 7 | interval: 3m 8 | sourceRef: 9 | kind: GitRepository 10 | name: nebius-cloud 11 | postBuild: 12 | substitute: 13 | soperator_version: 1.20.0 14 | path: "./fluxcd/environment/nebius-cloud/prod" 15 | prune: false 16 | timeout: 1m 17 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/prod/bootstrap/git-repository.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: source.toolkit.fluxcd.io/v1 2 | kind: GitRepository 3 | metadata: 4 | name: flux-system 5 | namespace: flux-system 6 | spec: 7 | interval: 3m 8 | url: oci://cr.eu-north1.nebius.cloud/soperator 9 | ref: 10 | branch: main 11 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/prod/bootstrap/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ./git-repository.yaml 5 | - ./flux-kustomization.yaml 6 | -------------------------------------------------------------------------------- /fluxcd/environment/nebius-cloud/prod/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - ../base 5 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Nebius B.V. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-nodeconfigurator 3 | description: A Helm chart for soperator-nodeconfigurator 4 | # A chart can be either an 'application' or a 'library' chart. 5 | # 6 | # Application charts are a collection of templates that can be packaged into versioned archives 7 | # to be deployed.
8 | # 9 | # Library charts provide useful utilities or functions for the chart developer. They're included as 10 | # a dependency of application charts to inject those utilities and functions into the rendering 11 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 12 | type: application 13 | # This is the chart version. This version number should be incremented each time you make changes 14 | # to the chart and its templates, including the app version. 15 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 16 | version: 1.20.0 17 | # This is the version number of the application being deployed. This version number should be 18 | # incremented each time you make changes to the application. Versions are not expected to 19 | # follow Semantic Versioning. They should reflect the version the application is using. 20 | # It is recommended to use it with quotes. 21 | appVersion: "1.20.0" 22 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/templates/binding-rbac.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.rebooter.generateRBAC .Values.rebooter.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "nodeconfigurator.fullname" . }}-binding 6 | labels: 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/part-of: slurm-operator 9 | {{- include "nodeconfigurator.labels" . | nindent 4 }} 10 | roleRef: 11 | apiGroup: rbac.authorization.k8s.io 12 | kind: ClusterRole 13 | name: '{{ include "nodeconfigurator.fullname" . }}-nodeconfigurator-role' 14 | subjects: 15 | - kind: ServiceAccount 16 | name: {{ include "nodeconfigurator.name" . }}-sa 17 | namespace: '{{ .Release.Namespace }}' 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/templates/nodeconfigurator-rbac.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.rebooter.generateRBAC .Values.rebooter.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "nodeconfigurator.fullname" . }}-nodeconfigurator-role 6 | labels: 7 | {{- include "nodeconfigurator.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: 10 | - "" 11 | resources: 12 | - nodes 13 | verbs: 14 | - get 15 | - list 16 | - update 17 | - watch 18 | - apiGroups: 19 | - "" 20 | resources: 21 | - nodes/status 22 | verbs: 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - "" 30 | resources: 31 | - pods 32 | verbs: 33 | - get 34 | - list 35 | - watch 36 | {{- end }} 37 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.rebooter.generateRBAC .Values.rebooter.enabled }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "nodeconfigurator.name" . }}-sa 6 | {{- with include "nodeconfigurator.labels" . }} 7 | labels: 8 | {{- . 
| nindent 4 -}} 9 | {{- end }} 10 | {{- end }} 11 | -------------------------------------------------------------------------------- /helm/nodeconfigurator/values.yaml: -------------------------------------------------------------------------------- 1 | nodeConfigurator: 2 | enabled: false 3 | env: [] 4 | image: 5 | repository: cr.eu-north1.nebius.cloud/soperator/busybox 6 | tag: "latest" 7 | pullPolicy: IfNotPresent 8 | nodeSelector: {} 9 | resources: {} 10 | livenessProbe: {} 11 | readinessProbe: {} 12 | tolerations: [] 13 | affinity: {} 14 | priorityClassName: "" 15 | serviceAccountName: "" 16 | rebooter: 17 | enabled: true 18 | generateRBAC: true 19 | evictionMethod: "evict" 20 | logLevel: "info" 21 | logFormat: "json" 22 | env: [] 23 | image: 24 | repository: "cr.eu-north1.nebius.cloud/soperator/rebooter" 25 | tag: "1.20.0" 26 | pullPolicy: IfNotPresent 27 | nodeSelector: {} 28 | resources: {} 29 | livenessProbe: {} 30 | readinessProbe: {} 31 | tolerations: [] 32 | affinity: {} 33 | priorityClassName: "" 34 | serviceAccountName: "" 35 | initContainers: [] 36 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-slurm-cluster-storage 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: "1.20.0" 6 | appVersion: "1.20.0" 7 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/accounting-pv.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.volume.accounting.enabled }} 2 | apiVersion: v1 3 | kind: PersistentVolume 4 | metadata: 5 | name: {{ include "slurm-cluster-storage.volume.accounting.pv" . }} 6 | spec: 7 | storageClassName: {{ include "slurm-cluster-storage.volume.accounting.storageClass" . }} 8 | volumeMode: Filesystem 9 | mountOptions: 10 | - rw 11 | - relatime 12 | capacity: 13 | storage: {{ include "slurm-cluster-storage.volume.accounting.size" . }} 14 | accessModes: 15 | - ReadWriteMany 16 | persistentVolumeReclaimPolicy: Retain 17 | local: 18 | path: /mnt/accounting 19 | nodeAffinity: 20 | required: 21 | nodeSelectorTerms: 22 | - matchExpressions: 23 | {{ .Values.storage.accounting.matchExpressions | toYaml | indent 12 }} 24 | {{- end }} 25 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/controller-spool-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: {{ include "slurm-cluster-storage.volume.controller-spool.pv" . 
}} 5 | spec: 6 | storageClassName: {{ include "slurm-cluster-storage.volume.controller-spool.storageClass" . }} 7 | volumeMode: Filesystem 8 | mountOptions: 9 | - rw 10 | - relatime 11 | capacity: 12 | storage: {{ include "slurm-cluster-storage.volume.controller-spool.size" . }} 13 | accessModes: 14 | - ReadWriteMany 15 | persistentVolumeReclaimPolicy: Retain 16 | local: 17 | path: /mnt/controller-spool 18 | claimRef: 19 | namespace: {{ .Release.Namespace }} 20 | name: {{ include "slurm-cluster-storage.volume.controller-spool.pvc" . }} 21 | nodeAffinity: 22 | required: 23 | nodeSelectorTerms: 24 | - matchExpressions: 25 | {{ .Values.storage.controllerSpool.matchExpressions | toYaml | indent 12 }} 26 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/controller-spool-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | namespace: {{ .Release.Namespace }} 5 | name: {{ include "slurm-cluster-storage.volume.controller-spool.pvc" . }} 6 | spec: 7 | storageClassName: {{ include "slurm-cluster-storage.volume.controller-spool.storageClass" . }} 8 | resources: 9 | requests: 10 | storage: {{ include "slurm-cluster-storage.volume.controller-spool.size" . }} 11 | accessModes: 12 | - ReadWriteMany 13 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/jail-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: {{ include "slurm-cluster-storage.volume.jail.pv" . }} 5 | spec: 6 | storageClassName: {{ include "slurm-cluster-storage.volume.jail.storageClass" . }} 7 | volumeMode: Filesystem 8 | mountOptions: 9 | - rw 10 | - relatime 11 | - exec 12 | - dev 13 | capacity: 14 | storage: {{ include "slurm-cluster-storage.volume.jail.size" . }} 15 | accessModes: 16 | - ReadWriteMany 17 | persistentVolumeReclaimPolicy: Retain 18 | local: 19 | path: /mnt/jail 20 | claimRef: 21 | namespace: {{ .Release.Namespace }} 22 | name: {{ include "slurm-cluster-storage.volume.jail.pvc" . }} 23 | nodeAffinity: 24 | required: 25 | nodeSelectorTerms: 26 | - matchExpressions: 27 | {{ .Values.storage.jail.matchExpressions | toYaml | indent 12 }} 28 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/jail-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | namespace: {{ .Release.Namespace }} 5 | name: {{ include "slurm-cluster-storage.volume.jail.pvc" . }} 6 | annotations: 7 | k8up.io/backup: 'true' 8 | spec: 9 | storageClassName: {{ include "slurm-cluster-storage.volume.jail.storageClass" . }} 10 | resources: 11 | requests: 12 | storage: {{ include "slurm-cluster-storage.volume.jail.size" . }} 13 | accessModes: 14 | - ReadWriteMany 15 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/jail-submounts-pv.yaml: -------------------------------------------------------------------------------- 1 | {{- range .Values.volume.jailSubMounts }} 2 | apiVersion: v1 3 | kind: PersistentVolume 4 | metadata: 5 | name: {{ include "slurm-cluster-storage.volume.jail-submount.pv" . 
}} 6 | spec: 7 | storageClassName: {{ include "slurm-cluster-storage.volume.jail-submount.storageClass" $ }} 8 | volumeMode: Filesystem 9 | mountOptions: 10 | - rw 11 | - relatime 12 | - exec 13 | capacity: 14 | storage: {{ include "slurm-cluster-storage.volume.jail-submount.size" . }} 15 | accessModes: 16 | - ReadWriteMany 17 | persistentVolumeReclaimPolicy: Retain 18 | local: 19 | path: /mnt/{{ include "slurm-cluster-storage.volume.jail-submount.name" . }} 20 | claimRef: 21 | namespace: {{ $.Release.Namespace }} 22 | name: {{ include "slurm-cluster-storage.volume.jail-submount.pvc" . }} 23 | nodeAffinity: 24 | required: 25 | nodeSelectorTerms: 26 | - matchExpressions: 27 | {{ $.Values.storage.jail.matchExpressions | toYaml | indent 12 }} 28 | --- 29 | {{- end }} 30 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/jail-submounts-pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- range .Values.volume.jailSubMounts }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | namespace: {{ $.Release.Namespace }} 6 | name: {{ include "slurm-cluster-storage.volume.jail-submount.pvc" . }} 7 | spec: 8 | storageClassName: {{ include "slurm-cluster-storage.volume.jail-submount.storageClass" $ }} 9 | resources: 10 | requests: 11 | storage: {{ include "slurm-cluster-storage.volume.jail-submount.size" . }} 12 | accessModes: 13 | - ReadWriteMany 14 | --- 15 | {{- end }} 16 | -------------------------------------------------------------------------------- /helm/slurm-cluster-storage/templates/local-storageclass.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | name: {{ include "slurm-cluster-storage.class.local.name" . }} 5 | provisioner: kubernetes.io/no-provisioner 6 | volumeBindingMode: WaitForFirstConsumer 7 | -------------------------------------------------------------------------------- /helm/slurm-cluster/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/slurm-cluster/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-slurm-cluster 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: "1.20.0" 6 | appVersion: "1.20.0" 7 | kubeVersion: ">=1.29.0-0" 8 | -------------------------------------------------------------------------------- /helm/slurm-cluster/templates/_registry_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* This file is generated by make sync-version. 
*/}} 2 | 3 | {{/* Container registry with stable Docker images */}} 4 | {{- define "slurm-cluster.containerRegistry" -}} 5 | {{- "cr.eu-north1.nebius.cloud/soperator" -}} 6 | {{- end }} 7 | -------------------------------------------------------------------------------- /helm/slurm-cluster/templates/_secret_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* sshdKeysName secret */}} 2 | {{- define "slurm-cluster.secret.sshdKeysName" -}} 3 | {{- .Values.secrets.sshdKeysName }} 4 | {{- end }} 5 | 6 | {{/* 7 | --- 8 | */}} 9 | -------------------------------------------------------------------------------- /helm/slurm-cluster/templates/priority-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: {{ include "slurm-cluster.name" . }} 5 | value: 1000000 6 | globalDefault: false 7 | -------------------------------------------------------------------------------- /helm/slurm-cluster/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- range $name, $volume := .Values.volumeSources }} 2 | {{- if $volume.createPVC }} 3 | apiVersion: v1 4 | kind: PersistentVolumeClaim 5 | metadata: 6 | namespace: {{ $.Release.Namespace }} 7 | name: {{ required "Claim name must be provided." $volume.persistentVolumeClaim.claimName | quote }} 8 | spec: 9 | accessModes: 10 | - ReadWriteMany 11 | volumeMode: Filesystem 12 | resources: 13 | requests: 14 | storage: {{ required "Volume size must be provided." $volume.size | quote }} 15 | storageClassName: {{ required "Storage class name must be provided." $volume.storageClassName | quote }} 16 | {{- end }} 17 | --- 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /helm/soperator-activechecks/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/soperator-activechecks/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperator-activechecks 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: "1.20.0" 6 | appVersion: "1.20.0" 7 | -------------------------------------------------------------------------------- /helm/soperator-activechecks/README.md: -------------------------------------------------------------------------------- 1 | # Soperator ActiveCheck Helm chart 2 | 3 | This Helm chart deploys an ActiveCheck resource to a Soperator cluster. 4 | 5 | ### To install / update: 6 | 7 | ```bash 8 | helm upgrade --install activecheck ./soperator-activechecks -f activecheck.yaml 9 | ``` 10 | As an example, we can use the following `activecheck.yaml` for `k8sJob` checks: 11 | ```yaml 12 | activeCheck: 13 | enabled: true 14 | checkType: "k8sJob" 15 | schedule: "0 */2 * * *" # every 2 hours 16 | k8sJobSpec: 17 | command: 18 | - "/bin/sh" 19 | - "-c" 20 | - "echo Hello, activecheck!" 21 | ``` 22 | and the following for `slurmJob` checks: 23 | ```yaml 24 | activeCheck: 25 | enabled: true 26 | checkType: "slurmJob" 27 | schedule: "0 */3 * * *" # every 3 hours 28 | slurmJobSpec: 29 | sbatchScript: | 30 | #!/bin/bash 31 | #SBATCH -J simple_job 32 | #SBATCH --output=output.txt 33 | 34 | srun echo "Hello, activecheck!" 35 | ``` 36 | 37 | ### To delete: 38 | 39 | ```bash 40 | helm uninstall activecheck 41 | ``` 42 | -------------------------------------------------------------------------------- /helm/soperator-activechecks/values.yaml: -------------------------------------------------------------------------------- 1 | activeCheck: 2 | enabled: false 3 | namespace: "soperator" 4 | checkType: "k8sJob" 5 | schedule: "0 * * * *" 6 | suspend: true 7 | slurmClusterRefName: "soperator" 8 | successfulJobsHistoryLimit: 3 9 | failedJobsHistoryLimit: 1 10 | runAfterCreation: false 11 | k8sJobSpec: 12 | jobContainer: 13 | command: ["/bin/sh", "-c", "echo Hello, world!"] 14 | slurmJobSpec: 15 | sbatchScript: | 16 | #!/bin/bash 17 | #SBATCH -J simple_job # Job name 18 | #SBATCH --output=output.txt # Output file 19 | 20 | srun echo "Hello, world!" 21 | jobContainer: 22 | env: 23 | - name: "K8S_POD_NAME" 24 | valueFrom: 25 | fieldRef: 26 | fieldPath: "metadata.name" 27 | - name: "K8S_POD_NAMESPACE" 28 | valueFrom: 29 | fieldRef: 30 | fieldPath: "metadata.namespace" 31 | volumeMounts: 32 | - mountPath: "/mnt/jail" 33 | name: "jail" 34 | volumes: 35 | - name: "jail" 36 | persistentVolumeClaim: 37 | claimName: "jail-pvc" 38 | images: 39 | slurmJob: "cr.eu-north1.nebius.cloud/soperator/slurm_check_job:1.20.0-jammy-slurm24.05.7" 40 | k8sJob: "cr.eu-north1.nebius.cloud/soperator/ubuntu:jammy" 41 | munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.20.0-jammy-slurm24.05.7" 42 | -------------------------------------------------------------------------------- /helm/soperator-crds/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/soperator-crds/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperator-crds 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 1.20.0 6 | appVersion: "1.20.0" 7 | kubeVersion: ">=1.29.0-0" 8 | -------------------------------------------------------------------------------- /helm/soperator-crds/values.yaml: -------------------------------------------------------------------------------- 1 | kubernetesClusterDomain: cluster.local 2 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperator-dcgm-exporter 3 | description: A Helm chart for Nvidia DCGM Exporter 4 | type: application 5 | version: 1.20.0 6 | appVersion: "1.20.0" 7 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . }}-metrics 5 | labels: 6 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 7 | data: 8 | dcgm-metrics.csv: {{ .Values.exporterMetricsConfigMap.dcgmMetricsCsv | toYaml | indent 1 }} 9 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . }}-role 5 | labels: 6 | app: nvidia-dcgm-exporter 7 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: 10 | - security.openshift.io 11 | resources: 12 | - securitycontextconstraints 13 | verbs: 14 | - use 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - configmaps 19 | - pods 20 | verbs: 21 | - get 22 | - list 23 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . 
}}-role-binding 5 | labels: 6 | app: nvidia-dcgm-exporter 7 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: Role 11 | name: {{ include "soperator-dcgm-exporter.name" . }}-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: {{ include "soperator-dcgm-exporter.name" . }}-sa 15 | namespace: '{{ .Release.Namespace }}' 16 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . }}-svc 5 | labels: 6 | app: nvidia-dcgm-exporter 7 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 8 | annotations: 9 | prometheus.io/scrape: "true" 10 | spec: 11 | type: {{ .Values.serviceType }} 12 | selector: 13 | app: nvidia-dcgm-exporter 14 | {{- include "soperator-dcgm-exporter.selectorLabels" . | nindent 4 }} 15 | ports: 16 | - name: gpu-metrics 17 | port: {{ .Values.metricsPort }} 18 | protocol: TCP 19 | targetPort: {{ .Values.metricsPort }} 20 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . }}-sa 5 | labels: 6 | app: nvidia-dcgm-exporter 7 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 8 | annotations: 9 | {{- toYaml .Values.serviceAccount.annotations | nindent 4 }} 10 | -------------------------------------------------------------------------------- /helm/soperator-dcgm-exporter/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: {{ include "soperator-dcgm-exporter.name" . }}-service-monitor 5 | labels: 6 | app: nvidia-dcgm-exporter 7 | {{- include "soperator-dcgm-exporter.labels" . | nindent 4 }} 8 | spec: 9 | endpoints: 10 | - interval: {{ .Values.scrapeInterval }} 11 | path: /metrics 12 | port: gpu-metrics 13 | relabelings: 14 | - action: replace 15 | regex: nvidia-dcgm-exporter 16 | replacement: dcgm-exporter 17 | sourceLabels: 18 | - __meta_kubernetes_pod_label_app 19 | targetLabel: app_kubernetes_io_name 20 | jobLabel: app 21 | namespaceSelector: 22 | matchNames: 23 | - soperator 24 | selector: 25 | matchLabels: 26 | app: nvidia-dcgm-exporter 27 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | tests/ 25 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperator-fluxcd 3 | description: Umbrella Helm chart for soperator-fluxcd 4 | # A chart can be either an 'application' or a 'library' chart. 5 | # 6 | # Application charts are a collection of templates that can be packaged into versioned archives 7 | # to be deployed. 8 | # 9 | # Library charts provide useful utilities or functions for the chart developer. They're included as 10 | # a dependency of application charts to inject those utilities and functions into the rendering 11 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 12 | type: application 13 | # This is the chart version. This version number should be incremented each time you make changes 14 | # to the chart and its templates, including the app version. 15 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 16 | version: 1.20.0 17 | # This is the version number of the application being deployed. This version number should be 18 | # incremented each time you make changes to the application. Versions are not expected to 19 | # follow Semantic Versioning. They should reflect the version the application is using. 20 | # It is recommended to use it with quotes. 21 | appVersion: "1.20.0" 22 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/templates/backup_schedule.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.backup.enabled }} 2 | apiVersion: helm.toolkit.fluxcd.io/v2 3 | kind: HelmRelease 4 | metadata: 5 | name: {{ include "soperator-fluxcd.fullname" . }}-backup-schedule 6 | labels: 7 | {{- include "soperator-fluxcd.labels" . | nindent 4 }} 8 | spec: 9 | chart: 10 | spec: 11 | chart: raw 12 | interval: {{ .Values.ns.interval }} 13 | sourceRef: 14 | kind: HelmRepository 15 | name: {{ include "soperator-fluxcd.fullname" . }}-bedag 16 | version: {{ .Values.ns.version }} 17 | dependsOn: 18 | - name: {{ include "soperator-fluxcd.fullname" . }}-ns 19 | - name: {{ include "soperator-fluxcd.fullname" . 
}}-k8up 20 | install: 21 | crds: Skip 22 | remediation: 23 | retries: 3 24 | interval: {{ .Values.backup.schedule.interval }} 25 | timeout: {{ .Values.backup.schedule.timeout }} 26 | releaseName: {{ .Values.backup.schedule.releaseName }} 27 | targetNamespace: {{ .Values.slurmCluster.namespace }} 28 | upgrade: 29 | crds: Skip 30 | values: 31 | resources: 32 | - apiVersion: k8up.io/v1 33 | kind: Schedule 34 | metadata: 35 | name: soperator-jail 36 | spec: {{ toYaml .Values.backup.schedule.values.spec | nindent 8 }} 37 | valuesFrom: 38 | - kind: ConfigMap 39 | name: backup-schedule 40 | optional: true 41 | valuesKey: values.yaml 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/templates/mariadb-operator-crds.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.mariadbOperator.enabled }} 2 | apiVersion: helm.toolkit.fluxcd.io/v2 3 | kind: HelmRelease 4 | metadata: 5 | name: {{ include "soperator-fluxcd.fullname" . }}-mariadb-operator-crds 6 | labels: 7 | {{- include "soperator-fluxcd.labels" . | nindent 4 }} 8 | spec: 9 | chart: 10 | spec: 11 | chart: mariadb-operator-crds 12 | interval: {{ .Values.mariadbOperator.interval }} 13 | sourceRef: 14 | kind: HelmRepository 15 | name: {{ include "soperator-fluxcd.fullname" . }}-mariadb-operator 16 | version: {{ .Values.mariadbOperator.version }} 17 | driftDetection: 18 | mode: enabled 19 | install: 20 | crds: CreateReplace 21 | remediation: 22 | retries: 3 23 | interval: {{ .Values.mariadbOperator.interval }} 24 | timeout: {{ .Values.mariadbOperator.timeout }} 25 | upgrade: 26 | crds: CreateReplace 27 | {{- end }} 28 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/templates/prometheus-operator-crds.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.observability.enabled .Values.observability.prometheusOperator.enabled }} 2 | apiVersion: helm.toolkit.fluxcd.io/v2 3 | kind: HelmRelease 4 | metadata: 5 | name: {{ include "soperator-fluxcd.fullname" . }}-prometheus-operator-crds 6 | labels: 7 | {{- include "soperator-fluxcd.labels" . | nindent 4 }} 8 | spec: 9 | chart: 10 | spec: 11 | chart: prometheus-operator-crds 12 | interval: {{ .Values.observability.prometheusOperator.interval }} 13 | sourceRef: 14 | kind: HelmRepository 15 | name: {{ include "soperator-fluxcd.fullname" . }}-prometheus-operator-crds 16 | version: {{ .Values.observability.prometheusOperator.version }} 17 | interval: 60m 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/templates/victoria-metrics-operator-crds.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.observability.enabled .Values.observability.vmStack.enabled }} 2 | apiVersion: helm.toolkit.fluxcd.io/v2 3 | kind: HelmRelease 4 | metadata: 5 | name: {{ include "soperator-fluxcd.fullname" . }}-victoria-metrics-operator-crds 6 | labels: 7 | {{- include "soperator-fluxcd.labels" . | nindent 4 }} 8 | spec: 9 | chart: 10 | spec: 11 | chart: victoria-metrics-operator-crds 12 | interval: {{ .Values.observability.vmStack.crds.interval }} 13 | sourceRef: 14 | kind: HelmRepository 15 | name: {{ include "soperator-fluxcd.fullname" . 
}}-victoriametrics 16 | version: {{ .Values.observability.vmStack.crds.version }} 17 | driftDetection: 18 | mode: enabled 19 | install: 20 | crds: CreateReplace 21 | remediation: 22 | retries: 3 23 | interval: {{ .Values.observability.vmStack.crds.interval }} 24 | upgrade: 25 | crds: CreateReplace 26 | remediation: 27 | retries: 3 28 | remediateLastFailure: true 29 | {{- end }} 30 | -------------------------------------------------------------------------------- /helm/soperator-fluxcd/tests/component_enabled_test.yaml: -------------------------------------------------------------------------------- 1 | suite: test general enabled flag behavior 2 | templates: 3 | - templates/*.yaml 4 | excludeTemplates: 5 | - templates/helmrepository.yaml 6 | tests: 7 | - it: should not render component when enabled=false 8 | asserts: 9 | - hasDocuments: 10 | count: 1 11 | -------------------------------------------------------------------------------- /helm/soperator/.gitignore: -------------------------------------------------------------------------------- 1 | charts/kruise-*.tgz 2 | Chart.lock 3 | -------------------------------------------------------------------------------- /helm/soperator/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/soperator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperator 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 1.20.0 6 | appVersion: "1.20.0" 7 | kubeVersion: ">=1.29.0-0" 8 | dependencies: 9 | - name: kruise 10 | version: 1.8.0 11 | repository: https://openkruise.github.io/charts/ 12 | condition: kruise.installOperator 13 | -------------------------------------------------------------------------------- /helm/soperator/charts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebius/soperator/c5d96cc2826b38c6b07bb9277ca895957794d319/helm/soperator/charts/.gitkeep -------------------------------------------------------------------------------- /helm/soperator/templates/binding-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-binding 5 | labels: 6 | app.kubernetes.io/component: rbac 7 | app.kubernetes.io/created-by: slurm-operator 8 | app.kubernetes.io/part-of: slurm-operator 9 | {{- include "soperator.labels" . | nindent 4 }} 10 | roleRef: 11 | apiGroup: rbac.authorization.k8s.io 12 | kind: ClusterRole 13 | name: '{{ include "soperator.fullname" . }}-manager-role' 14 | subjects: 15 | - kind: ServiceAccount 16 | name: '{{ include "soperator.fullname" . 
}}-manager' 17 | namespace: '{{ .Release.Namespace }}' 18 | -------------------------------------------------------------------------------- /helm/soperator/templates/metrics-reader-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-metrics-reader 5 | labels: 6 | app.kubernetes.io/component: kube-rbac-proxy 7 | app.kubernetes.io/created-by: slurm-operator 8 | app.kubernetes.io/part-of: slurm-operator 9 | {{- include "soperator.labels" . | nindent 4 }} 10 | rules: 11 | - nonResourceURLs: 12 | - /metrics 13 | verbs: 14 | - get 15 | -------------------------------------------------------------------------------- /helm/soperator/templates/proxy-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-proxy-role 5 | labels: 6 | app.kubernetes.io/component: kube-rbac-proxy 7 | app.kubernetes.io/created-by: slurm-operator 8 | app.kubernetes.io/part-of: slurm-operator 9 | {{- include "soperator.labels" . | nindent 4 }} 10 | rules: 11 | - apiGroups: 12 | - authentication.k8s.io 13 | resources: 14 | - tokenreviews 15 | verbs: 16 | - create 17 | - apiGroups: 18 | - authorization.k8s.io 19 | resources: 20 | - subjectaccessreviews 21 | verbs: 22 | - create 23 | --- 24 | apiVersion: rbac.authorization.k8s.io/v1 25 | kind: ClusterRoleBinding 26 | metadata: 27 | name: {{ include "soperator.fullname" . }}-proxy-rolebinding 28 | labels: 29 | app.kubernetes.io/component: kube-rbac-proxy 30 | app.kubernetes.io/created-by: slurm-operator 31 | app.kubernetes.io/part-of: slurm-operator 32 | {{- include "soperator.labels" . | nindent 4 }} 33 | roleRef: 34 | apiGroup: rbac.authorization.k8s.io 35 | kind: ClusterRole 36 | name: '{{ include "soperator.fullname" . }}-proxy-role' 37 | subjects: 38 | - kind: ServiceAccount 39 | name: '{{ include "soperator.fullname" . }}-manager' 40 | namespace: '{{ .Release.Namespace }}' 41 | -------------------------------------------------------------------------------- /helm/soperator/templates/selfsigned-issuer.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.certManager.enabled }} 2 | apiVersion: cert-manager.io/v1 3 | kind: Issuer 4 | metadata: 5 | name: {{ include "soperator.fullname" . }}-selfsigned-issuer 6 | labels: 7 | {{- include "soperator.labels" . | nindent 4 }} 8 | spec: 9 | selfSigned: {} 10 | {{- end }} 11 | -------------------------------------------------------------------------------- /helm/soperator/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-manager 5 | labels: 6 | app.kubernetes.io/component: rbac 7 | app.kubernetes.io/created-by: slurm-operator 8 | app.kubernetes.io/part-of: slurm-operator 9 | {{- include "soperator.labels" . 
| nindent 4 }} 10 | annotations: 11 | {{- toYaml .Values.controllerManager.serviceAccount.annotations | nindent 4 }} 12 | -------------------------------------------------------------------------------- /helm/soperator/templates/serving-cert.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.certManager.enabled }} 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: {{ include "soperator.fullname" . }}-serving-cert 6 | labels: 7 | {{- include "soperator.labels" . | nindent 4 }} 8 | spec: 9 | dnsNames: 10 | - '{{ include "soperator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.{{ .Values.kubernetesClusterDomain }}' 11 | - '{{ include "soperator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc' 12 | - '{{ include "soperator.fullname" . }}-webhook-service.{{ .Release.Namespace }}' 13 | - '{{ include "soperator.fullname" . }}-webhook-service' 14 | issuerRef: 15 | kind: Issuer 16 | name: '{{ include "soperator.fullname" . }}-selfsigned-issuer' 17 | secretName: webhook-server-cert 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /helm/soperator/templates/soperator-checks-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-soperator-checks-role 5 | labels: 6 | {{- include "soperator.labels" . | nindent 4 }} 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - nodes 12 | verbs: 13 | - get 14 | - list 15 | - watch 16 | - apiGroups: 17 | - "" 18 | resources: 19 | - nodes/status 20 | verbs: 21 | - get 22 | - update 23 | - apiGroups: 24 | - "" 25 | resources: 26 | - pods 27 | verbs: 28 | - create 29 | - delete 30 | - get 31 | - list 32 | - update 33 | - watch -------------------------------------------------------------------------------- /helm/soperator/templates/validate-secrets.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.certManager.enabled }} 2 | apiVersion: admissionregistration.k8s.io/v1 3 | kind: ValidatingWebhookConfiguration 4 | metadata: 5 | name: {{ include "soperator.fullname" . }}-validate-secrets 6 | {{- if.Values.certManager.enabled }} 7 | annotations: 8 | cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "soperator.fullname" . }}-serving-cert 9 | {{- end }} 10 | labels: 11 | {{- include "soperator.labels" . | nindent 4 }} 12 | webhooks: 13 | - admissionReviewVersions: 14 | - v1 15 | - v1beta1 16 | clientConfig: 17 | service: 18 | name: '{{ include "soperator.fullname" . }}-webhook-service' 19 | namespace: '{{ .Release.Namespace }}' 20 | path: /validate--v1-secret 21 | failurePolicy: Fail 22 | name: validate.secrets.nebius.ai 23 | objectSelector: 24 | matchLabels: 25 | slurm.nebius.ai/webhook: "true" 26 | rules: 27 | - apiGroups: 28 | - "" 29 | apiVersions: 30 | - v1 31 | operations: 32 | - CREATE 33 | - UPDATE 34 | - DELETE 35 | resources: 36 | - secrets 37 | sideEffects: None 38 | {{- end }} 39 | -------------------------------------------------------------------------------- /helm/soperator/templates/webhook-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "soperator.fullname" . }}-webhook-service 5 | labels: 6 | {{- include "soperator.labels" . 
| nindent 4 }} 7 | spec: 8 | type: {{ .Values.webhookService.type }} 9 | selector: 10 | control-plane: controller-manager 11 | {{- include "soperator.selectorLabels" . | nindent 4 }} 12 | ports: 13 | {{- .Values.webhookService.ports | toYaml | nindent 2 }} 14 | -------------------------------------------------------------------------------- /helm/soperatorchecks/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/soperatorchecks/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: helm-soperatorchecks 3 | description: A Helm chart for Kubernetes 4 | # A chart can be either an 'application' or a 'library' chart. 5 | # 6 | # Application charts are a collection of templates that can be packaged into versioned archives 7 | # to be deployed. 8 | # 9 | # Library charts provide useful utilities or functions for the chart developer. They're included as 10 | # a dependency of application charts to inject those utilities and functions into the rendering 11 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 12 | type: application 13 | # This is the chart version. This version number should be incremented each time you make changes 14 | # to the chart and its templates, including the app version. 15 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 16 | version: 1.20.0 17 | # This is the version number of the application being deployed. This version number should be 18 | # incremented each time you make changes to the application. Versions are not expected to 19 | # follow Semantic Versioning. They should reflect the version the application is using. 20 | # It is recommended to use it with quotes. 21 | appVersion: "1.20.0" 22 | -------------------------------------------------------------------------------- /helm/soperatorchecks/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "soperatorchecks.fullname" . }}-checks 5 | labels: 6 | {{- include "soperatorchecks.labels" . | nindent 4 }} 7 | annotations: 8 | {{- toYaml .Values.checks.serviceAccount.annotations | nindent 4 }} -------------------------------------------------------------------------------- /helm/soperatorchecks/templates/soperator-checks-binding-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | {{- include "soperatorchecks.labels" . | nindent 4 }} 6 | name: {{ include "soperatorchecks.fullname" . }}-rolebinding 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: ClusterRole 10 | name: {{ include "soperatorchecks.fullname" . 
}}-soperator-checks-role 11 | subjects: 12 | - kind: ServiceAccount 13 | name: {{ include "soperatorchecks.fullname" . }}-checks 14 | namespace: '{{ .Release.Namespace }}' 15 | -------------------------------------------------------------------------------- /images/accounting/slurmdbd_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | echo "Bind-mount REST JWT key from K8S secret" 6 | touch /var/spool/slurmdbd/jwt_hs256.key 7 | mount --bind /mnt/rest-jwt-key/rest_jwt.key /var/spool/slurmdbd/jwt_hs256.key 8 | 9 | echo "Symlink slurm configs from K8S config map" 10 | rm -rf /etc/slurm && ln -s /mnt/slurm-configs /etc/slurm 11 | 12 | echo "Set permissions for shared /var/spool/slurmdbd" 13 | chmod 755 /var/spool/slurmdbd # It changes permissions of this shared directory in other containers as well 14 | 15 | # TODO: Since 1.29 kubernetes supports native sidecar containers. We can remove it in feature releases 16 | echo "Waiting until munge started" 17 | while [ ! -S "/run/munge/munge.socket.2" ]; do sleep 2; done 18 | 19 | # Hack with logs: multilog will write log in stdout and in log file, and rotate log file 20 | # # s100000000 (bytes) - 100MB, n5 - 5 files 21 | 22 | echo "Start slurmdbd daemon" 23 | exec /usr/sbin/slurmdbd -D 2>&1 | tee >(multilog s100000000 n5 /var/log/slurm/multilog) 24 | -------------------------------------------------------------------------------- /images/common/nvidia-container-runtime/config.toml: -------------------------------------------------------------------------------- 1 | #accept-nvidia-visible-devices-as-volume-mounts = false 2 | #accept-nvidia-visible-devices-envvar-when-unprivileged = true 3 | disable-require = false 4 | supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video" 5 | #swarm-resource = "DOCKER_RESOURCE_GPU" 6 | 7 | [nvidia-container-cli] 8 | #debug = "/var/log/nvidia-container-toolkit.log" 9 | environment = [] 10 | #ldcache = "/etc/ld.so.cache" 11 | ldconfig = "@/sbin/ldconfig.real" 12 | load-kmods = true 13 | #no-cgroups = false 14 | #path = "/usr/bin/nvidia-container-cli" 15 | #root = "/run/nvidia/driver" 16 | #user = "root:video" 17 | 18 | [nvidia-container-cli.options] 19 | devices = ["/dev/infiniband"] 20 | 21 | [nvidia-container-runtime] 22 | #debug = "/var/log/nvidia-container-runtime.log" 23 | log-level = "info" 24 | mode = "auto" 25 | runtimes = ["docker-runc", "runc", "crun"] 26 | 27 | [nvidia-container-runtime.modes] 28 | 29 | [nvidia-container-runtime.modes.cdi] 30 | annotation-prefixes = ["cdi.k8s.io/"] 31 | default-kind = "nvidia.com/gpu" 32 | spec-dirs = ["/etc/cdi", "/var/run/cdi"] 33 | 34 | [nvidia-container-runtime.modes.csv] 35 | mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" 36 | 37 | [nvidia-container-runtime-hook] 38 | path = "nvidia-container-runtime-hook" 39 | skip-mode-detection = false 40 | 41 | [nvidia-ctk] 42 | path = "nvidia-ctk" 43 | -------------------------------------------------------------------------------- /images/common/scripts/install_awscli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # ALT_ARCH has the extended form like: x86_64, aarch64 6 | ALT_ARCH="$(uname -m)" 7 | 8 | pushd /tmp || exit 1 9 | curl 
"https://awscli.amazonaws.com/awscli-exe-linux-${ALT_ARCH}.zip" -o "awscliv2.zip" 10 | unzip awscliv2.zip 11 | ./aws/install 12 | rm -rf /tmp/* 13 | popd || exit 1 14 | -------------------------------------------------------------------------------- /images/common/scripts/install_chroot_plugin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | set -u # Treat unset variables as an error and exit immediately 5 | 6 | # Check if ALT_ARCH is set and not empty 7 | if [ -z "${ALT_ARCH:-}" ]; then 8 | echo "❌ ALT_ARCH is not set. Please set the ALT_ARCH environment variable (e.g., x86_64, aarch64)." 9 | exit 1 10 | fi 11 | 12 | # Compile and install chroot SPANK plugin 13 | gcc -fPIC -shared -o /usr/src/chroot-plugin/chroot.so /usr/src/chroot-plugin/chroot.c -I/usr/local/include/slurm -L/usr/local/lib -lslurm && \ 14 | cp /usr/src/chroot-plugin/chroot.so /usr/lib/"${ALT_ARCH}"-linux-gnu/slurm/ 15 | -------------------------------------------------------------------------------- /images/common/scripts/install_container_toolkit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # 1.17.4 latest working version 6 | # since 1.17.7 there was commit that breaks CUDA 7 | NVIDIA_TOOLKIT_VERSION=1.17.4-1 8 | 9 | # Install nvidia-container-toolkit for propagating NVIDIA drivers to containers 10 | curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ 11 | && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ 12 | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ 13 | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list 14 | 15 | apt-get update 16 | apt-get install -y nvidia-container-toolkit=${NVIDIA_TOOLKIT_VERSION} \ 17 | nvidia-container-toolkit-base=${NVIDIA_TOOLKIT_VERSION} \ 18 | libnvidia-container-tools=${NVIDIA_TOOLKIT_VERSION} \ 19 | libnvidia-container1=${NVIDIA_TOOLKIT_VERSION} \ 20 | 21 | apt-get clean 22 | rm -rf /var/lib/apt/lists/* 23 | -------------------------------------------------------------------------------- /images/common/scripts/install_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # Add Docker's official GPG key 6 | apt update -y 7 | apt install -y ca-certificates curl 8 | install -m 0755 -d /etc/apt/keyrings 9 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc 10 | chmod a+r /etc/apt/keyrings/docker.asc 11 | 12 | # Add the repository to Apt sources 13 | echo \ 14 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ 15 | $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ 16 | tee /etc/apt/sources.list.d/docker.list > /dev/null 17 | apt update -y 18 | 19 | # Install Docker daemon and its dependencies 20 | apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin 21 | apt clean 22 | -------------------------------------------------------------------------------- /images/common/scripts/install_docker_cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # Add Docker's official GPG key 6 | apt update -y 7 | apt install -y ca-certificates curl 8 | install -m 0755 -d /etc/apt/keyrings 9 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc 10 | chmod a+r /etc/apt/keyrings/docker.asc 11 | 12 | # Add the repository to Apt sources 13 | echo \ 14 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ 15 | $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ 16 | tee /etc/apt/sources.list.d/docker.list > /dev/null 17 | apt update -y 18 | 19 | # Install Docker daemon and its dependencies 20 | apt install -y docker-ce-cli 21 | apt clean 22 | rm -rf /var/lib/apt/lists/* 23 | -------------------------------------------------------------------------------- /images/common/scripts/install_enroot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | ENROOT_VERSION=3.5.0 6 | 7 | apt-get update 8 | apt -y install enroot=${ENROOT_VERSION}-1 enroot+caps=${ENROOT_VERSION}-1 9 | apt-get clean 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | # Add an extra hook that sets env vars for PyTorch 13 | curl -fSsL -o /etc/enroot/hooks.d/50-slurm-pytorch.sh "https://raw.githubusercontent.com/NVIDIA/enroot/refs/tags/v${ENROOT_VERSION}/conf/hooks/extra/50-slurm-pytorch.sh" 14 | chmod +x /etc/enroot/hooks.d/50-slurm-pytorch.sh 15 | 16 | # Prepare env for running enroot 17 | mkdir -m 777 /usr/share/enroot/enroot-data 18 | mkdir -m 755 /run/enroot 19 | -------------------------------------------------------------------------------- /images/common/scripts/install_munge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # Install munge 6 | apt update 7 | apt install -y munge libmunge-dev 8 | apt clean 9 | rm -rf /var/lib/apt/lists/* 10 | 11 | # Fix permissions 12 | chmod -R 700 /etc/munge /var/log/munge 13 | chmod -R 711 /var/lib/munge 14 | chown -R 0:0 /etc/munge /var/log/munge /var/lib/munge 15 | -------------------------------------------------------------------------------- /images/common/scripts/install_openmpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | OPENMPI_VERSION=4.1.7a1-1.2310055 6 | OPENMPI_VERSION_SHORT=4.1.7a1 7 | UCX_VERSION=1.16.0-1.2310213 8 | OFED_VERSION=23.10-2.1.3.1 9 | DISTRO=$(. 
/etc/os-release; echo "$ID""$VERSION_ID") 10 | ALT_ARCH="$(uname -m)" 11 | 12 | cd /etc/apt/sources.list.d || exit 13 | wget https://linux.mellanox.com/public/repo/mlnx_ofed/$OFED_VERSION/"$DISTRO"/mellanox_mlnx_ofed.list 14 | wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - 15 | apt update 16 | apt install openmpi="$OPENMPI_VERSION" ucx="$UCX_VERSION" 17 | apt clean 18 | rm -rf /var/lib/apt/lists/* 19 | 20 | echo "export PATH=\$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION_SHORT}/bin" > /etc/profile.d/path_openmpi.sh 21 | source /etc/profile.d/path_openmpi.sh 22 | 23 | printf "/lib/${ALT_ARCH}-linux-gnu\n/usr/lib/${ALT_ARCH}-linux-gnu\n/usr/local/cuda/targets/${ALT_ARCH}-linux/lib\n/usr/mpi/gcc/openmpi-%s/lib" "${OPENMPI_VERSION_SHORT}" > /etc/ld.so.conf.d/openmpi.conf 24 | ldconfig 25 | -------------------------------------------------------------------------------- /images/common/scripts/install_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | # Install python 6 | add-apt-repository ppa:deadsnakes/ppa -y 7 | apt-get update 8 | apt -y install \ 9 | python3.10 \ 10 | python3.10-dev \ 11 | python3.10-venv \ 12 | python3.10-dbg 13 | apt-get clean 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | # Install pip 17 | curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 18 | 19 | # Make python3.10 the default python 20 | ln -s -f /usr/bin/python3.10 /usr/bin/python && ln -s -f /usr/bin/python3.10 /usr/bin/python3 21 | -------------------------------------------------------------------------------- /images/common/scripts/install_rclone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | pushd /tmp || exit 1 6 | curl "https://rclone.org/install.sh" | bash 7 | rm -rf /tmp/* 8 | popd || exit 1 9 | -------------------------------------------------------------------------------- /images/controller/slurmctld_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | echo "Link users from jail" 6 | ln -s /mnt/jail/etc/passwd /etc/passwd 7 | ln -s /mnt/jail/etc/group /etc/group 8 | ln -s /mnt/jail/etc/shadow /etc/shadow 9 | ln -s /mnt/jail/etc/gshadow /etc/gshadow 10 | chown -h 0:42 /etc/{shadow,gshadow} 11 | 12 | echo "Bind-mount REST JWT secret key from K8S secret" 13 | touch /var/spool/slurmctld/jwt_hs256.key 14 | mount --bind /mnt/rest-jwt-key/rest_jwt.key /var/spool/slurmctld/jwt_hs256.key 15 | 16 | echo "Symlink slurm configs from K8S config map" 17 | rm -rf /etc/slurm && ln -s /mnt/jail/slurm /etc/slurm 18 | 19 | echo "Set permissions for shared /var/spool/slurmctld" 20 | chmod 755 /var/spool/slurmctld # It changes permissions of this shared directory in other containers as well 21 | 22 | # TODO: Since 1.29 kubernetes supports native sidecar containers. We can remove it in feature releases 23 | echo "Waiting until munge started" 24 | while [ ! 
-S "/run/munge/munge.socket.2" ]; do sleep 2; done 25 | 26 | # Hack with logs: multilog will write log in stdout and in log file, and rotate log file 27 | # # s100000000 (bytes) - 100MB, n5 - 5 files 28 | 29 | echo "Start slurmctld daemon" 30 | exec /usr/sbin/slurmctld -D 2>&1 | tee >(multilog s100000000 n5 /var/log/slurm/multilog) 31 | -------------------------------------------------------------------------------- /images/jail/init-users/group: -------------------------------------------------------------------------------- 1 | root:x:0: 2 | daemon:x:1: 3 | bin:x:2: 4 | sys:x:3: 5 | adm:x:4: 6 | tty:x:5: 7 | disk:x:6: 8 | lp:x:7: 9 | mail:x:8: 10 | news:x:9: 11 | uucp:x:10: 12 | man:x:12: 13 | proxy:x:13: 14 | kmem:x:15: 15 | dialout:x:20: 16 | fax:x:21: 17 | voice:x:22: 18 | cdrom:x:24: 19 | floppy:x:25: 20 | tape:x:26: 21 | sudo:x:27: 22 | audio:x:29: 23 | dip:x:30: 24 | www-data:x:33: 25 | backup:x:34: 26 | operator:x:37: 27 | list:x:38: 28 | irc:x:39: 29 | src:x:40: 30 | gnats:x:41: 31 | shadow:x:42: 32 | utmp:x:43: 33 | video:x:44: 34 | sasl:x:45: 35 | plugdev:x:46: 36 | staff:x:50: 37 | games:x:60: 38 | users:x:100: 39 | nogroup:x:65534: 40 | _ssh:x:101: 41 | crontab:x:102: 42 | messagebus:x:105: 43 | docker:x:999: 44 | -------------------------------------------------------------------------------- /images/jail/init-users/gshadow: -------------------------------------------------------------------------------- 1 | root:*:: 2 | daemon:*:: 3 | bin:*:: 4 | sys:*:: 5 | adm:*:: 6 | tty:*:: 7 | disk:*:: 8 | lp:*:: 9 | mail:*:: 10 | news:*:: 11 | uucp:*:: 12 | man:*:: 13 | proxy:*:: 14 | kmem:*:: 15 | dialout:*:: 16 | fax:*:: 17 | voice:*:: 18 | cdrom:*:: 19 | floppy:*:: 20 | tape:*:: 21 | sudo:*:: 22 | audio:*:: 23 | dip:*:: 24 | www-data:*:: 25 | backup:*:: 26 | operator:*:: 27 | list:*:: 28 | irc:*:: 29 | src:*:: 30 | gnats:*:: 31 | shadow:*:: 32 | utmp:*:: 33 | video:*:: 34 | sasl:*:: 35 | plugdev:*:: 36 | staff:*:: 37 | games:*:: 38 | users:*:: 39 | nogroup:*:: 40 | _ssh:!:: 41 | crontab:!:: 42 | docker:!:: 43 | -------------------------------------------------------------------------------- /images/jail/init-users/passwd: -------------------------------------------------------------------------------- 1 | root:x:0:0:root:/root:/bin/bash 2 | daemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin 3 | bin:x:2:2:bin:/bin:/usr/sbin/nologin 4 | sys:x:3:3:sys:/dev:/usr/sbin/nologin 5 | sync:x:4:65534:sync:/bin:/bin/sync 6 | games:x:5:60:games:/usr/games:/usr/sbin/nologin 7 | man:x:6:12:man:/var/cache/man:/usr/sbin/nologin 8 | lp:x:7:7:lp:/var/spool/lpd:/usr/sbin/nologin 9 | mail:x:8:8:mail:/var/mail:/usr/sbin/nologin 10 | news:x:9:9:news:/var/spool/news:/usr/sbin/nologin 11 | uucp:x:10:10:uucp:/var/spool/uucp:/usr/sbin/nologin 12 | proxy:x:13:13:proxy:/bin:/usr/sbin/nologin 13 | www-data:x:33:33:www-data:/var/www:/usr/sbin/nologin 14 | backup:x:34:34:backup:/var/backups:/usr/sbin/nologin 15 | list:x:38:38:Mailing List Manager:/var/list:/usr/sbin/nologin 16 | irc:x:39:39:ircd:/run/ircd:/usr/sbin/nologin 17 | gnats:x:41:41:Gnats Bug-Reporting System (admin):/var/lib/gnats:/usr/sbin/nologin 18 | nobody:x:65534:65534:nobody:/nonexistent:/usr/sbin/nologin 19 | _apt:x:100:65534::/nonexistent:/usr/sbin/nologin 20 | sshd:x:105:65534::/run/sshd:/usr/sbin/nologin 21 | -------------------------------------------------------------------------------- /images/jail/init-users/shadow: -------------------------------------------------------------------------------- 1 | root:*:19634:0:99999:7::: 2 | 
daemon:*:19634:0:99999:7::: 3 | bin:*:19634:0:99999:7::: 4 | sys:*:19634:0:99999:7::: 5 | sync:*:19634:0:99999:7::: 6 | games:*:19634:0:99999:7::: 7 | man:*:19634:0:99999:7::: 8 | lp:*:19634:0:99999:7::: 9 | mail:*:19634:0:99999:7::: 10 | news:*:19634:0:99999:7::: 11 | uucp:*:19634:0:99999:7::: 12 | proxy:*:19634:0:99999:7::: 13 | www-data:*:19634:0:99999:7::: 14 | backup:*:19634:0:99999:7::: 15 | list:*:19634:0:99999:7::: 16 | irc:*:19634:0:99999:7::: 17 | gnats:*:19634:0:99999:7::: 18 | nobody:*:19634:0:99999:7::: 19 | _apt:*:19634:0:99999:7::: 20 | sshd:*:19870:0:99999:7::: 21 | -------------------------------------------------------------------------------- /images/jail/motd/00-welcome: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | [ -r /etc/lsb-release ] && . /etc/lsb-release 4 | 5 | if [ -z "$DISTRIB_DESCRIPTION" ] && [ -x /usr/bin/lsb_release ]; then 6 | DISTRIB_DESCRIPTION=$(lsb_release -s -d) 7 | fi 8 | 9 | CPU_ARCH="$(uname -m)" 10 | HOSTNAME="$(hostname)" 11 | 12 | printf "Welcome to Soperator cluster \n\n" 13 | printf "You are on node %s (%s %s) \n" "${HOSTNAME}" "${DISTRIB_DESCRIPTION}" "${CPU_ARCH}" 14 | -------------------------------------------------------------------------------- /images/jail/motd/20-slurm-stats: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SLURM_STATS_CMD=" 4 | CONTROLLERS=\"\$(scontrol ping 2>/dev/null)\" 5 | CONTROLLERS_EXITCODE=\"\$?\" 6 | 7 | if [ \"\${CONTROLLERS_EXITCODE}\" = \"0\" ]; then 8 | printf \"\\nSlurm nodes:\\n\" 9 | sinfo --Format=PartitionName:12,CPUs:7,Memory:10,Gres:37,Nodes:8,NodeList:26,StateLong:8,Reason:50 2>/dev/null | sed 's/^/ /' 10 | printf \"\\n\" 11 | 12 | QUEUE=\$(squeue --Format=JobID:12,Partition:12,Name:24,UserName:16,State:10,TimeUsed:9,NumNodes:8,ReasonList:50 2>/dev/null | awk 'NR == 1 || \$4 != \"root\"') 13 | QUEUE_LINES=\$(printf \"%s\" \"\${QUEUE}\" | grep -c '^') 14 | if [ \"\${QUEUE_LINES}\" -le 1 ]; then 15 | printf \"No user jobs in the queue\\n\" 16 | else 17 | printf \"Job queue:\\n\" 18 | printf \"%s\\n\" \"\${QUEUE}\" | sed 's/^/ /' 19 | fi 20 | else 21 | printf \"\\nSlurm controllers:\\n\" 22 | echo \"\${CONTROLLERS}\" | sed 's/^/ /' 23 | fi 24 | " 25 | 26 | chroot /mnt/jail /bin/sh -c "${SLURM_STATS_CMD}" 27 | -------------------------------------------------------------------------------- /images/jail/motd/30-ssh-users: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | USER="$(whoami)" 4 | LOGGED_IN_USERS=$(last -F | awk '/still logged in/ {print $1, $5, $6, $7, $8}' | grep -vE "^${USER}") 5 | if [ -z "$LOGGED_IN_USERS" ]; then 6 | printf "\nNo other users are currently logged in \n" 7 | else 8 | printf "\nOther users currently logged in: \n" 9 | printf "%s\n" "$LOGGED_IN_USERS" | awk '{printf " * %s - since %s %s %s %s\n", $1, $2, $3, $4, $5}' 10 | fi 11 | -------------------------------------------------------------------------------- /images/jail/pin_packages/cuda-pins: -------------------------------------------------------------------------------- 1 | Package: cuda-drivers 2 | Pin: version 9999.9999.9999 3 | Pin-Priority: 1001 4 | 5 | Package: nvidia-open 6 | Pin: version 9999.9999.9999 7 | Pin-Priority: 1001 8 | 9 | Package: cuda 10 | Pin: version 12.4.1-1 11 | Pin-Priority: 1001 12 | -------------------------------------------------------------------------------- /images/jail/scripts/nvidia_smi_hostpid.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | chroot /run/nvidia/driver nvidia-smi 4 | -------------------------------------------------------------------------------- /images/jail/skel/.bash_logout: -------------------------------------------------------------------------------- 1 | # ~/.bash_logout: executed by bash(1) when login shell exits. 2 | 3 | # when leaving the console clear the screen to increase privacy 4 | 5 | if [ "$SHLVL" = 1 ]; then 6 | [ -x /usr/bin/clear_console ] && /usr/bin/clear_console -q 7 | fi 8 | -------------------------------------------------------------------------------- /images/jail/skel/.config/enroot/.credentials: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebius/soperator/c5d96cc2826b38c6b07bb9277ca895957794d319/images/jail/skel/.config/enroot/.credentials -------------------------------------------------------------------------------- /images/jail/skel/.profile: -------------------------------------------------------------------------------- 1 | # ~/.profile: executed by the command interpreter for login shells. 2 | # This file is not read by bash(1), if ~/.bash_profile or ~/.bash_login 3 | # exists. 4 | # see /usr/share/doc/bash/examples/startup-files for examples. 5 | # the files are located in the bash-doc package. 6 | 7 | # the default umask is set in /etc/profile; for setting the umask 8 | # for ssh logins, install and configure the libpam-umask package. 9 | #umask 022 10 | 11 | # if running bash 12 | if [ -n "$BASH_VERSION" ]; then 13 | # include .bashrc if it exists 14 | if [ -f "$HOME/.bashrc" ]; then 15 | . "$HOME/.bashrc" 16 | fi 17 | fi 18 | 19 | # set PATH so it includes user's private bin if it exists 20 | if [ -d "$HOME/bin" ] ; then 21 | PATH="$HOME/bin:$PATH" 22 | fi 23 | 24 | # set PATH so it includes user's private bin if it exists 25 | if [ -d "$HOME/.local/bin" ] ; then 26 | PATH="$HOME/.local/bin:$PATH" 27 | fi 28 | 29 | export SLURM_TIME_FORMAT='%Y-%m-%d %H:%M:%S.%Z' 30 | -------------------------------------------------------------------------------- /images/jail/skel/.slurm/defaults: -------------------------------------------------------------------------------- 1 | cpu-bind=verbose 2 | -------------------------------------------------------------------------------- /images/k8s_check_job/k8s_check_job.dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=cr.eu-north1.nebius.cloud/soperator/ubuntu:jammy 2 | 3 | FROM $BASE_IMAGE AS k8s_check_job 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y openssh-client && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/* 9 | -------------------------------------------------------------------------------- /images/login/sshd_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | echo "Link users from jail" 6 | ln -s /mnt/jail/etc/passwd /etc/passwd 7 | ln -s /mnt/jail/etc/group /etc/group 8 | ln -s /mnt/jail/etc/shadow /etc/shadow 9 | ln -s /mnt/jail/etc/gshadow /etc/gshadow 10 | chown -h 0:42 /etc/{shadow,gshadow} 11 | 12 | echo "Link SSH \"message of the day\" scripts from jail" 13 | ln -s /mnt/jail/etc/update-motd.d /etc/update-motd.d 14 | 15 | echo "Link home from jail to use SSH keys from there" 16 | ln -s /mnt/jail/home /home 17 | 18 | echo "Link 
soperatorchecks home from jail to use SSH keys from there" 19 | ln -s /mnt/jail/opt/soperatorchecks /opt/soperatorchecks 20 | 21 | echo "Create privilege separation directory /var/run/sshd" 22 | mkdir -p /var/run/sshd 23 | 24 | echo "Complement jail rootfs" 25 | /opt/bin/slurm/complement_jail.sh -j /mnt/jail -u /mnt/jail.upper 26 | 27 | # TODO: Since 1.29 kubernetes supports native sidecar containers. We can remove it in feature releases 28 | echo "Waiting until munge started" 29 | while [ ! -S "/run/munge/munge.socket.2" ]; do sleep 2; done 30 | 31 | echo "Start sshd daemon" 32 | /usr/sbin/sshd -D -e -f /mnt/ssh-configs/sshd_config 33 | -------------------------------------------------------------------------------- /images/munge/munge.dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=cr.eu-north1.nebius.cloud/soperator/ubuntu:jammy 2 | 3 | FROM $BASE_IMAGE AS munge 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install munge 8 | COPY images/common/scripts/install_munge.sh /opt/bin/ 9 | RUN chmod +x /opt/bin/install_munge.sh && \ 10 | /opt/bin/install_munge.sh && \ 11 | rm /opt/bin/install_munge.sh 12 | 13 | # Update linker cache 14 | RUN ldconfig 15 | 16 | ENV MUNGE_NUM_THREADS=10 17 | ENV MUNGE_KEY_FILE=/etc/munge/munge.key 18 | ENV MUNGE_PID_FILE=/run/munge/munged.pid 19 | ENV MUNGE_SOCKET_FILE=/run/munge/munge.socket.2 20 | 21 | # Copy & run the entrypoint script 22 | COPY images/munge/munge_entrypoint.sh /opt/bin/ 23 | RUN chmod +x /opt/bin/munge_entrypoint.sh 24 | ENTRYPOINT ["/opt/bin/munge_entrypoint.sh"] 25 | -------------------------------------------------------------------------------- /images/munge/munge_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | echo "Bind-mount munge key from K8S secret" 6 | mount --bind /mnt/munge-key/munge.key /etc/munge/munge.key 7 | 8 | echo "Set permissions for shared /run/munge" 9 | chmod 755 /run/munge # It changes permissions of this shared directory in other containers as well 10 | 11 | echo "Start munge daemon" 12 | munged -F --num-threads="$MUNGE_NUM_THREADS" --key-file="$MUNGE_KEY_FILE" --pid-file="$MUNGE_PID_FILE" -S "$MUNGE_SOCKET_FILE" 13 | -------------------------------------------------------------------------------- /images/populate_jail/populate_jail.dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=cr.eu-north1.nebius.cloud/soperator/ubuntu:jammy 2 | 3 | # First stage: untap jail_rootfs.tar 4 | FROM $BASE_IMAGE AS untaped 5 | COPY images/jail_rootfs.tar /jail_rootfs.tar 6 | RUN mkdir /jail && \ 7 | tar -xvf /jail_rootfs.tar -C /jail && \ 8 | rm /jail_rootfs.tar 9 | 10 | # Second stage: copy untaped jail environment to the target 11 | FROM $BASE_IMAGE AS populate_jail 12 | 13 | ARG DEBIAN_FRONTEND=noninteractive 14 | 15 | RUN apt update && \ 16 | apt install -y rclone rsync && \ 17 | apt clean 18 | 19 | COPY --from=untaped /jail /jail 20 | 21 | COPY images/populate_jail/populate_jail_entrypoint.sh . 22 | RUN chmod +x ./populate_jail_entrypoint.sh 23 | ENTRYPOINT ["./populate_jail_entrypoint.sh"] 24 | -------------------------------------------------------------------------------- /images/populate_jail/populate_jail_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while ! 
mountpoint -q /mnt/jail; do 4 | echo "Waiting until /mnt/jail is mounted" 5 | sleep 10 6 | done 7 | 8 | if [ "$OVERWRITE" != "1" ] && [ -d /mnt/jail/dev ]; then 9 | echo "Jail is already populated and content overwriting is turned off, exiting" 10 | exit 0 11 | fi 12 | 13 | echo "Delete everything from jail directory" 14 | rm -rf -- /mnt/jail/..?* /mnt/jail/.[!.]* /mnt/jail/* 15 | 16 | echo "Rclone and rsync jail rootfs into jail directory" 17 | rclone copy /jail /mnt/jail --progress --transfers="$(( $(nproc) * 2 ))" --links 18 | rsync --verbose --archive --one-file-system --xattrs --numeric-ids --sparse --acls --hard-links /jail/ /mnt/jail/ 19 | 20 | echo "Set permissions for jail directory" 21 | chmod 755 /mnt/jail # Permissions 755 are only allowed permissions for OpenSSH ChrootDirectory feature 22 | 23 | # TODO: Move this to an active check/action when it's implemented 24 | echo "Generate an internal SSH keypair for user root" 25 | apt update -y 26 | apt install -y openssh-client 27 | mkdir -p /mnt/jail/root/.ssh 28 | ssh-keygen -t ecdsa -f /mnt/jail/root/.ssh/id_ecdsa -N "" && cat /mnt/jail/root/.ssh/id_ecdsa.pub >> /mnt/jail/root/.ssh/authorized_keys 29 | -------------------------------------------------------------------------------- /images/rebooter/rebooter.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.24 AS rebooter_builder 2 | 3 | ARG GO_LDFLAGS="" 4 | ARG BUILD_TIME 5 | ARG CGO_ENABLED=0 6 | ARG GOOS=linux 7 | 8 | WORKDIR /operator 9 | 10 | # Copy only the necessary files to build the binary. 11 | COPY api api 12 | COPY cmd cmd 13 | COPY internal internal 14 | COPY pkg pkg 15 | COPY go.mod go.sum ./ 16 | 17 | RUN go mod download 18 | 19 | RUN GOOS=$GOOS CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ 20 | go build -o rebooter ./cmd/rebooter 21 | 22 | ####################################################################################################################### 23 | FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS rebooter 24 | 25 | COPY --from=rebooter_builder /operator/rebooter /usr/bin/ 26 | 27 | RUN addgroup -S -g 1001 rebooter && \ 28 | adduser -S -u 1001 rebooter -G rebooter rebooter && \ 29 | chown 1001:1001 /usr/bin/rebooter && \ 30 | chmod 755 /usr/bin/rebooter 31 | 32 | USER 1001 33 | 34 | CMD ["/usr/bin/rebooter"] 35 | -------------------------------------------------------------------------------- /images/restd/slurmrestd_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Exit immediately if any command returns a non-zero error code 4 | 5 | echo "Symlink slurm configs from K8S config map" 6 | rm -rf /etc/slurm && ln -s /mnt/slurm-configs /etc/slurm 7 | 8 | chown www-data:www-data /usr/sbin/slurmrestd && chmod 500 /usr/sbin/slurmrestd 9 | 10 | echo "Start slurmrestd daemon" 11 | exec /usr/sbin/slurmrestd -f /etc/slurm/slurm_rest.conf -u www-data -g www-data -a rest_auth/jwt -vvvvvv :6820 12 | -------------------------------------------------------------------------------- /images/sconfigcontroller/sconfigcontroller.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.24 AS sconfigcontroller_builder 2 | 3 | ARG GO_LDFLAGS="" 4 | ARG BUILD_TIME 5 | ARG CGO_ENABLED=0 6 | ARG GOOS=linux 7 | 8 | WORKDIR /operator 9 | 10 | # Copy only the necessary files to build the binary. 
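# NOTE: go.mod and go.sum are copied together with the sources here, so the
# `go mod download` layer below is re-run whenever any source file changes.
# Copying go.mod/go.sum first, downloading modules, and only then copying
# api/, cmd/, internal/ and pkg/ would let Docker cache the module layer.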
11 | COPY api api 12 | COPY cmd cmd 13 | COPY internal internal 14 | COPY pkg pkg 15 | COPY go.mod go.sum ./ 16 | 17 | RUN go mod download 18 | 19 | RUN GOOS=$GOOS CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ 20 | go build -o sconfigcontroller ./cmd/sconfigcontroller 21 | 22 | ####################################################################################################################### 23 | FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS sconfigcontroller 24 | 25 | COPY --from=sconfigcontroller_builder /operator/sconfigcontroller /usr/bin/ 26 | 27 | RUN addgroup -S -g 1001 sconfigcontroller && \ 28 | adduser -S -u 1001 sconfigcontroller -G sconfigcontroller sconfigcontroller && \ 29 | chown 1001:1001 /usr/bin/sconfigcontroller && \ 30 | chmod 755 /usr/bin/sconfigcontroller 31 | 32 | USER 1001 33 | 34 | CMD ["/usr/bin/sconfigcontroller"] 35 | -------------------------------------------------------------------------------- /images/soperator-exporter/soperator-exporter.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.24 AS exporter-builder 2 | 3 | ARG GO_LDFLAGS="" 4 | ARG BUILD_TIME 5 | ARG CGO_ENABLED=0 6 | ARG GOOS=linux 7 | 8 | WORKDIR /exporter 9 | 10 | # Copy only the necessary files to build the binary. 11 | COPY api api 12 | COPY cmd cmd 13 | COPY internal internal 14 | COPY pkg pkg 15 | COPY go.mod go.sum ./ 16 | 17 | RUN go mod download 18 | 19 | RUN GOOS=$GOOS CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ 20 | go build -o exporter ./cmd/exporter 21 | 22 | ####################################################################################################################### 23 | FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS soperator-exporter 24 | 25 | COPY --from=exporter-builder /exporter/exporter /usr/bin/ 26 | 27 | RUN addgroup -S -g 1001 exporter && \ 28 | adduser -S -u 1001 exporter -G exporter exporter && \ 29 | chown 1001:1001 /usr/bin/exporter && \ 30 | chmod 755 /usr/bin/exporter 31 | 32 | USER 1001 33 | 34 | ENTRYPOINT ["/usr/bin/exporter"] 35 | -------------------------------------------------------------------------------- /images/soperator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.24 AS operator_builder 2 | 3 | ARG GO_LDFLAGS="" 4 | ARG BUILD_TIME 5 | ARG CGO_ENABLED=0 6 | ARG GOOS=linux 7 | 8 | WORKDIR /operator 9 | 10 | # Copy only the necessary files to build the binary. 
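# api/, cmd/, internal/ and pkg/ hold the operator sources; go.mod and go.sum are
# copied alongside them so that `go mod download` and `go build` below can resolve
# module versions entirely inside this build stage.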
11 | COPY api api 12 | COPY cmd cmd 13 | COPY internal internal 14 | COPY pkg pkg 15 | COPY go.mod go.sum ./ 16 | 17 | RUN go mod download 18 | 19 | RUN GOOS=$GOOS CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ 20 | go build -o slurm_operator ./cmd/ 21 | 22 | ####################################################################################################################### 23 | FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS slurm-operator 24 | 25 | COPY --from=operator_builder /operator/slurm_operator /usr/bin/ 26 | 27 | RUN addgroup -S -g 1001 operator && \ 28 | adduser -S -u 1001 operator -G operator operator && \ 29 | chown 1001:1001 /usr/bin/slurm_operator && \ 30 | chmod 500 /usr/bin/slurm_operator 31 | 32 | USER 1001 33 | 34 | CMD ["/usr/bin/slurm_operator"] 35 | -------------------------------------------------------------------------------- /images/soperatorchecks/soperatorchecks.dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.24 AS soperatorchecks_builder 2 | 3 | ARG GO_LDFLAGS="" 4 | ARG BUILD_TIME 5 | ARG CGO_ENABLED=0 6 | ARG GOOS=linux 7 | 8 | WORKDIR /operator 9 | 10 | # Copy only the necessary files to build the binary. 11 | COPY api api 12 | COPY cmd cmd 13 | COPY internal internal 14 | COPY pkg pkg 15 | COPY go.mod go.sum ./ 16 | 17 | RUN go mod download 18 | 19 | RUN GOOS=$GOOS CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ 20 | go build -o soperatorchecks ./cmd/soperatorchecks 21 | 22 | ####################################################################################################################### 23 | FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS soperatorchecks 24 | 25 | COPY --from=soperatorchecks_builder /operator/soperatorchecks /usr/bin/ 26 | 27 | RUN addgroup -S -g 1001 soperatorchecks && \ 28 | adduser -S -u 1001 soperatorchecks -G soperatorchecks soperatorchecks && \ 29 | chown 1001:1001 /usr/bin/soperatorchecks && \ 30 | chmod 755 /usr/bin/soperatorchecks 31 | 32 | USER 1001 33 | 34 | CMD ["/usr/bin/soperatorchecks"] 35 | -------------------------------------------------------------------------------- /images/worker/scripts/gpu_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Run GPU healthcheck 6 | exit_code=0 7 | output=$(/usr/bin/nvidia-smi 2>&1) || exit_code=$? 
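# With `set -e` at the top, a failing `nvidia-smi` would normally abort the script
# immediately; the `|| exit_code=$?` pattern above captures the failure instead,
# so the error can be reported and the node drained further below.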
8 | 9 | current_node=$(hostname) 10 | node_info=$(scontrol show node "$current_node") 11 | node_status=$(sinfo -h -n "$current_node" -o "%t") 12 | node_reason=$(echo "$node_info" | grep "Reason=" | awk -F'Reason=' '{print $2}') 13 | 14 | if [[ $exit_code -eq 0 ]]; 15 | then 16 | echo "OK" 17 | if [[ "$node_status" == "drain" && "$node_reason" == "GPUHealthcheckError" ]]; 18 | then 19 | scontrol update NodeName="$current_node" State=resume Reason="" 20 | fi 21 | else 22 | echo "ERROR: nvidia-smi finished with exit code $exit_code" 23 | echo "$output" 24 | scontrol update NodeName="$current_node" State=drain Reason="GPUHealthcheckError" 25 | fi 26 | -------------------------------------------------------------------------------- /internal/check/consts.go: -------------------------------------------------------------------------------- 1 | package check 2 | 3 | const ( 4 | ForceTrue = true 5 | ForceFalse = false 6 | ) 7 | -------------------------------------------------------------------------------- /internal/check/maintanence.go: -------------------------------------------------------------------------------- 1 | package check 2 | 3 | import "nebius.ai/slurm-operator/internal/consts" 4 | 5 | func IsMaintenanceActive(maintenance *consts.MaintenanceMode) bool { 6 | return maintenance != nil && *maintenance != consts.ModeNone && *maintenance != consts.ModeSkipPopulate 7 | } 8 | 9 | func IsModeDownscaleAndDeletePopulate(maintenance *consts.MaintenanceMode) bool { 10 | return maintenance != nil && *maintenance == consts.ModeDownscaleAndDeletePopulate 11 | } 12 | 13 | func IsModeDownscaleAndOverwritePopulate(maintenance *consts.MaintenanceMode) bool { 14 | return maintenance != nil && *maintenance == consts.ModeDownscaleAndOverwritePopulate 15 | } 16 | 17 | func IsModeSkipPopulateJail(maintenance *consts.MaintenanceMode) bool { 18 | return maintenance != nil && *maintenance == consts.ModeSkipPopulate 19 | } 20 | -------------------------------------------------------------------------------- /internal/check/resources.go: -------------------------------------------------------------------------------- 1 | package check 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | ) 8 | 9 | func CheckResourceRequests(resources corev1.ResourceRequirements) error { 10 | memoryQuantity, memoryOk := resources.Requests[corev1.ResourceMemory] 11 | cpuQuantity, cpuOk := resources.Requests[corev1.ResourceCPU] 12 | 13 | if !memoryOk || memoryQuantity.IsZero() { 14 | return fmt.Errorf("memory request not set or is zero") 15 | } 16 | 17 | if !cpuOk || cpuQuantity.IsZero() { 18 | return fmt.Errorf("CPU request not set or is zero") 19 | } 20 | 21 | return nil 22 | } 23 | -------------------------------------------------------------------------------- /internal/consts/accounting.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | SecretSlurmdbdConfigs = "slurmdbd-configs" 5 | HostnameAccounting = "accounting" 6 | DefaultAccountingPort = 6819 7 | SlurmdbdPidFile = "/var/run/slurmdbd.pid" 8 | SlurmdbdRESTJWTKeyPath = "/var/spool/slurmdbd/jwt_hs256.key" 9 | ) 10 | -------------------------------------------------------------------------------- /internal/consts/activecheck.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | ActiveCheckFinalizer = "slurm.nebius.ai/activecheck-finalizer" 5 | ) 6 | 
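// Illustrative sketch (an assumption for this document, not a file from the repository):
// finalizer constants such as ActiveCheckFinalizer above are typically wired into a
// reconciler with controller-runtime's controllerutil helpers, roughly as follows.
package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// ensureFinalizer attaches the finalizer to live objects and releases it once the
// object is being deleted and cleanup has completed.
func ensureFinalizer(ctx context.Context, c client.Client, obj client.Object, finalizer string) error {
	if obj.GetDeletionTimestamp().IsZero() {
		if !controllerutil.ContainsFinalizer(obj, finalizer) {
			controllerutil.AddFinalizer(obj, finalizer)
			return c.Update(ctx, obj)
		}
		return nil
	}
	// Deletion requested: perform any cleanup here, then remove the finalizer so
	// the API server can finish deleting the object.
	if controllerutil.ContainsFinalizer(obj, finalizer) {
		controllerutil.RemoveFinalizer(obj, finalizer)
		return c.Update(ctx, obj)
	}
	return nil
}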
-------------------------------------------------------------------------------- /internal/consts/annotation.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | AnnotationVersions = "versions" 5 | 6 | AnnotationApparmorKey = "container.apparmor.security.beta.kubernetes.io" 7 | DefaultContainerAnnotationName = "kubectl.kubernetes.io/default-container" 8 | AnnotationClusterName = "slurm.nebius.ai/cluster" 9 | AnnotationActiveCheckKey = "slurm.nebius.ai/activecheck" 10 | 11 | AnnotationSConfigControllerSourceKey = LabelSConfigControllerSourceKey + "/path" 12 | DefaultSConfigControllerSourcePath = "/slurm" 13 | ) 14 | -------------------------------------------------------------------------------- /internal/consts/cgroup.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | CGroupV2 = "v2" 5 | CGroupV1 = "v1" 6 | CGroupV2Env = "CGROUP_V2" 7 | ) 8 | -------------------------------------------------------------------------------- /internal/consts/cluster_type.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | import "errors" 4 | 5 | type ClusterType interface { 6 | ClusterType() 7 | String() string 8 | } 9 | 10 | type baseClusterType struct { 11 | value string 12 | } 13 | 14 | func (b baseClusterType) ClusterType() {} 15 | func (b baseClusterType) String() string { 16 | return b.value 17 | } 18 | 19 | var ( 20 | ClusterTypeGPU ClusterType = baseClusterType{"gpu"} 21 | ClusterTypeCPU ClusterType = baseClusterType{"cpu"} 22 | ) 23 | 24 | var clusterTypeMap = map[string]ClusterType{ 25 | "gpu": ClusterTypeGPU, 26 | "cpu": ClusterTypeCPU, 27 | } 28 | 29 | func StringToClusterType(s string) (ClusterType, error) { 30 | if val, ok := clusterTypeMap[s]; ok { 31 | return val, nil 32 | } 33 | return nil, errors.New("unknown ClusterType: " + s) 34 | } 35 | -------------------------------------------------------------------------------- /internal/consts/container.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | ContainerNameSlurmctld = SlurmctldName 5 | ContainerNameAccounting = AccountingName 6 | ContainerNameMunge = Munge 7 | ContainerNameSlurmd = SlurmdName 8 | ContainerNameREST = Slurmrestd 9 | ContainerNameSshd = SshdName 10 | ContainerNameToolkitValidation = "toolkit-validation" 11 | ContainerNameNCCLBenchmark = ncclBenchmark 12 | ContainerNamePopulateJail = populateJail 13 | ContainerNameExporter = Exporter 14 | ContainerNameNodeSysctl = "node-sysctl" 15 | ContainerNameRebooter = "rebooter" 16 | ContainerNameNodeSysctlSleep = "node-sysctl-sleep" 17 | ContainerNameSConfigController = SConfigControllerName 18 | 19 | ContainerSecurityContextCapabilitySysAdmin = "SYS_ADMIN" 20 | 21 | ContainerPortNameExporter = "metrics" 22 | ContainerPortExporter = 8080 23 | ContainerPathExporter = "/metrics" 24 | ContainerSchemeExporter = "http" 25 | ) 26 | -------------------------------------------------------------------------------- /internal/consts/cronjob.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | CronJobNameNCCLBenchmark = ncclBenchmark 5 | ) 6 | -------------------------------------------------------------------------------- /internal/consts/indexfield.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 
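// NOTE: field index keys such as the one below are typically registered at startup
// via mgr.GetFieldIndexer().IndexField(...) and then queried with
// client.MatchingFields, e.g. to list all SlurmClusters referencing a given
// munge-key Secret.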
3 | const ( 4 | // IndexFieldSecretMungeKey is a field with the name of the secret containing Slurm key 5 | IndexFieldSecretMungeKey = ".spec.secrets.mungeKey" 6 | ) 7 | -------------------------------------------------------------------------------- /internal/consts/job.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | JobNamePopulateJail = populateJail 5 | ) 6 | -------------------------------------------------------------------------------- /internal/consts/label.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | LabelNameKey = "app.kubernetes.io/name" 5 | LabelNameValue = SlurmCluster 6 | 7 | LabelNameExporterValue = "slurm-exporter" 8 | 9 | // LabelInstanceKey value is taken from the corresponding CRD 10 | LabelInstanceKey = "app.kubernetes.io/instance" 11 | 12 | // LabelComponentKey value is taken from the corresponding CRD 13 | LabelComponentKey = "app.kubernetes.io/component" 14 | 15 | LabelPartOfKey = "app.kubernetes.io/part-of" 16 | LabelPartOfValue = slurmOperator 17 | 18 | LabelManagedByKey = "app.kubernetes.io/managed-by" 19 | LabelManagedByValue = slurmOperator 20 | LabelValidateKey = "slurm.nebius.ai/webhook" 21 | LabelValidateValue = "true" 22 | 23 | LabelNodeConfiguratorKey = "slurm.nebius.ai/node-configurator" 24 | LabelNodeConfiguratorValue = "true" 25 | 26 | LabelSConfigControllerSourceKey = "sconficontroller.slurm.nebius.ai" 27 | LabelSConfigControllerSourceValue = "true" 28 | 29 | TopologyLabelPrefix = "topologyconf.slurm.nebius.ai" 30 | TierOnePrefix = TopologyLabelPrefix + "/tier-1" 31 | ) 32 | -------------------------------------------------------------------------------- /internal/consts/maintenance.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | type MaintenanceMode string 4 | 5 | const ( 6 | ModeNone MaintenanceMode = "none" 7 | ModeDownscale MaintenanceMode = "downscale" 8 | ModeDownscaleAndDeletePopulate MaintenanceMode = "downscaleAndDeletePopulateJail" 9 | ModeDownscaleAndOverwritePopulate MaintenanceMode = "downscaleAndOverwritePopulateJail" 10 | ModeSkipPopulate MaintenanceMode = "skipPopulateJail" 11 | ) 12 | 13 | const ( 14 | ZeroReplicas = int32(0) 15 | SingleReplicas = int32(1) 16 | ) 17 | -------------------------------------------------------------------------------- /internal/consts/mariadb.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | MariaDbDatabase = "slurm_acct_db" 5 | MariaDbClusterSuffix = "acct-db" 6 | MariaDbTable = "slurm_acct_db.*" 7 | MariaDbUsername = "slurm" 8 | MariaDbPasswordKey = "password" 9 | MariaDbSecretName = "mariadb-password" 10 | MariaDbSecretRootName = "mariadb-root" 11 | MariaDbPort = 3306 12 | MariaDbDefaultMyCnf = `[mariadb] 13 | bind-address=* 14 | default_storage_engine=InnoDB 15 | innodb_default_row_format=DYNAMIC 16 | innodb_buffer_pool_size=32768M 17 | innodb_log_file_size=64M 18 | innodb_lock_wait_timeout=900 19 | max_allowed_packet=16M` 20 | ) 21 | -------------------------------------------------------------------------------- /internal/consts/nccl_topology_type.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | import "errors" 4 | 5 | type NCCLType interface { 6 | ncclType() 7 | String() string 8 | } 9 | 10 | type baseNCCLType struct { 11 | value string 12 | } 13 | 14 
| func (b baseNCCLType) ncclType() {} 15 | func (b baseNCCLType) String() string { 16 | return b.value 17 | } 18 | 19 | var ( 20 | NCCLTypeAuto NCCLType = baseNCCLType{"auto"} 21 | NCCLTypeCustom NCCLType = baseNCCLType{"custom"} 22 | ) 23 | 24 | var ncclTypeMap = map[string]NCCLType{ 25 | "auto": NCCLTypeAuto, 26 | "custom": NCCLTypeCustom, 27 | } 28 | 29 | func StringToNCCLType(s string) (NCCLType, error) { 30 | if val, ok := ncclTypeMap[s]; ok { 31 | return val, nil 32 | } 33 | return nil, errors.New("unknown NCCLType: " + s) 34 | } 35 | -------------------------------------------------------------------------------- /internal/consts/node_configurator.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | RebooterMethodEnv = "REBOOTER_EVICTION_METHOD" 5 | RebooterNodeNameEnv = "REBOOTER_NODE_NAME" 6 | ) 7 | 8 | type RebooterMethod string 9 | 10 | const ( 11 | RebooterEvict RebooterMethod = "evict" 12 | RebooterDrain RebooterMethod = "drain" 13 | ) 14 | 15 | const ( 16 | NodeConfiguratorName = "node-configurator" 17 | ) 18 | -------------------------------------------------------------------------------- /internal/consts/nvidia.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const NVIDIAGDRCopy = "NVIDIA_GDRCOPY" 4 | -------------------------------------------------------------------------------- /internal/consts/pagination.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const DefaultLimit int64 = 64 4 | -------------------------------------------------------------------------------- /internal/consts/rest.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | SecretSlurmRESTJWTKey = "rest-jwt-key" 5 | DefaultRESTPort = 6820 6 | RESTJWTKeyPath = "/var/spool/slurmctld/jwt_hs256.key" 7 | HostnameREST = "rest" 8 | ) 9 | -------------------------------------------------------------------------------- /internal/consts/sconfigcontroller.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | InitContainerImageSconfigController = "cr.eu-north1.nebius.cloud/soperator/busybox" 5 | ) 6 | -------------------------------------------------------------------------------- /internal/consts/secret.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | SecretMungeKeyName = Munge 5 | SecretMungeKeyFileName = Munge + ".key" 6 | SecretMungeKeyFileMode = int32(0400) 7 | 8 | SecretRESTJWTKeyFileName = "rest_jwt.key" 9 | SecretRESTJWTKeyFileMode = int32(0400) 10 | 11 | SecretSshdKeysPrivateFileMode = int32(0600) 12 | SecretSshdKeysPublicFileMode = int32(0644) 13 | SecretSshdKeysName = "sshd-keys" 14 | SecretSshdPublicKeysPostfix = ".pub" 15 | SecretSshdECDSAKeyName = "ssh_host_ecdsa_key" 16 | SecretSshdECDSAPubKeyName = SecretSshdECDSAKeyName + SecretSshdPublicKeysPostfix 17 | SecretSshdECDSA25519KeyName = "ssh_host_ed25519_key" 18 | SecretSshdECDSA25519PubKeyName = SecretSshdECDSA25519KeyName + SecretSshdPublicKeysPostfix 19 | SecretSshdRSAKeyName = "ssh_host_rsa_key" 20 | SecretSshdRSAPubKeyName = SecretSshdRSAKeyName + SecretSshdPublicKeysPostfix 21 | ) 22 | -------------------------------------------------------------------------------- /internal/consts/service.go: 
-------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | SlurmctldName = "slurmctld" 5 | AccountingName = "accounting" 6 | SlurmdName = "slurmd" 7 | Slurmrestd = "slurmrestd" 8 | SshdName = "sshd" 9 | Exporter = "exporter" 10 | NodeConfigurator = "node-configurator" 11 | SConfigControllerName = "sconfigctrl" 12 | 13 | ncclBenchmark = "nccl-benchmark" 14 | populateJail = "populate-jail" 15 | ) 16 | -------------------------------------------------------------------------------- /internal/consts/slurm.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | Slurm = "slurm" 5 | slurmPrefix = Slurm + "-" 6 | 7 | SlurmCluster = Slurm + "cluster" 8 | slurmOperator = slurmPrefix + "operator" 9 | 10 | // TODO: we should rename it. It's not only recommended using root user 11 | SlurmUser = "root" 12 | SlurmLogFile = "/dev/null" 13 | SlurmDefaultDebugLevel = "info" 14 | ) 15 | 16 | const ( 17 | SlurmNodeReasonKillTaskFailed string = "Kill task failed" 18 | SlurmNodeReasonNodeReplacement string = "Soperator auto-healing: node replacement process" 19 | SlurmNodeReasonNodeReboot string = "Soperator auto-healing: node reboot process" 20 | ) 21 | 22 | var SlurmNodeReasonsMap = map[string]struct{}{ 23 | SlurmNodeReasonKillTaskFailed: {}, 24 | SlurmNodeReasonNodeReplacement: {}, 25 | SlurmNodeReasonNodeReboot: {}, 26 | } 27 | 28 | const ( 29 | SlurmConfigRawStrategyPatch = "patch" 30 | SlurmConfigRawStrategyOverride = "override" 31 | ) 32 | -------------------------------------------------------------------------------- /internal/consts/sshd.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | // https://linux.die.net/man/5/sshd_config 4 | 5 | const ( 6 | SSHDClientAliveInterval = "3600" // 1 hour 7 | SSHDClientAliveCountMax = "5" 8 | SSHDMaxStartups = "100:50:300" 9 | SSHDLoginGraceTime = "120" 10 | SSHDMaxAuthTries = "4" 11 | ) 12 | -------------------------------------------------------------------------------- /internal/consts/statefulset.go: -------------------------------------------------------------------------------- 1 | package consts 2 | 3 | const ( 4 | PodManagementPolicy = "Parallel" 5 | ) 6 | -------------------------------------------------------------------------------- /internal/consts/version.go: -------------------------------------------------------------------------------- 1 | // This file is generated by make sync-version. 2 | package consts 3 | 4 | const ( 5 | VersionCR = "1.20.0" 6 | ) 7 | -------------------------------------------------------------------------------- /internal/controller/common_subjects_test.go: -------------------------------------------------------------------------------- 1 | package controller_test 2 | 3 | import ( 4 | "context" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . 
"github.com/onsi/gomega" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | 10 | slurmv1 "nebius.ai/slurm-operator/api/v1" 11 | ) 12 | 13 | type shouldCreateAvailableSlurmCluster struct { 14 | ctx context.Context 15 | client client.Client 16 | crd *slurmv1.SlurmCluster 17 | } 18 | 19 | func (s shouldCreateAvailableSlurmCluster) run() { 20 | By("checking that the CR can be created") 21 | Expect(s.client.Create(s.ctx, s.crd)).Should(Succeed()) 22 | 23 | By("checking that the Slurm Cluster can be fetched") 24 | createdCluster := &slurmv1.SlurmCluster{} 25 | eventuallyGetNamespacedObj(s.ctx, s.client, s.crd.Namespace, s.crd.Name, createdCluster) 26 | } 27 | -------------------------------------------------------------------------------- /internal/controller/fixtures_test.go: -------------------------------------------------------------------------------- 1 | package controller_test 2 | 3 | import ( 4 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 5 | 6 | slurmv1 "nebius.ai/slurm-operator/api/v1" 7 | ) 8 | 9 | func minimalSlurmClusterFixture(namespace string) *slurmv1.SlurmCluster { 10 | return &slurmv1.SlurmCluster{ 11 | TypeMeta: metav1.TypeMeta{ 12 | APIVersion: "slurm.nebius.ai/v1", 13 | Kind: slurmv1.KindSlurmCluster, 14 | }, 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: "minimal-slurm-cluster", 17 | Namespace: namespace, 18 | Labels: map[string]string{ 19 | "app.kubernetes.io/name": "slurmcluster", 20 | "app.kubernetes.io/instance": "test-slurm-cluster", 21 | "app.kubernetes.io/part-of": "slurm-operator", 22 | "app.kubernetes.io/managed-by": "kustomize", 23 | "app.kubernetes.io/created-by": "slurm-operator", 24 | }, 25 | }, 26 | Spec: slurmv1.SlurmClusterSpec{}, 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /internal/controller/helpers_test.go: -------------------------------------------------------------------------------- 1 | package controller_test 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | . "github.com/onsi/gomega" 8 | "k8s.io/apimachinery/pkg/types" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | ) 11 | 12 | func eventuallyGetNamespacedObj(ctx context.Context, client client.Client, namespace, name string, obj client.Object) { 13 | Eventually(func() bool { 14 | err := client.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, obj) 15 | return err == nil 16 | }, time.Second*10, time.Millisecond*300).Should(BeTrue()) 17 | } 18 | -------------------------------------------------------------------------------- /internal/controller/nodesetcontroller/controller_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 Nebius B.V. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package nodesetcontroller_test 18 | 19 | import ( 20 | . 
"github.com/onsi/ginkgo/v2" 21 | ) 22 | 23 | var _ = Describe("NodeSet Controller", func() { 24 | Context("When reconciling a resource", func() { 25 | 26 | It("should successfully reconcile the resource", func() { 27 | 28 | // TODO(user): Add more specific assertions depending on your controller's reconciliation logic. 29 | // Example: If you expect a certain status condition after reconciliation, verify it here. 30 | }) 31 | }) 32 | }) 33 | -------------------------------------------------------------------------------- /internal/controller/reconciler/fake_error_client_test.go: -------------------------------------------------------------------------------- 1 | package reconciler 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | 7 | apierrors "k8s.io/apimachinery/pkg/api/errors" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "sigs.k8s.io/controller-runtime/pkg/client" 10 | ) 11 | 12 | type fakeGoneClient struct { 13 | client.Client 14 | } 15 | 16 | func (c *fakeGoneClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { 17 | return &apierrors.StatusError{ 18 | ErrStatus: metav1.Status{ 19 | Status: metav1.StatusFailure, 20 | Code: http.StatusGone, 21 | Reason: metav1.StatusReasonGone, 22 | Message: "the resource is gone", 23 | }, 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /internal/controller/sconfigcontroller/file_store.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | ) 8 | 9 | type FileStore struct { 10 | path string 11 | } 12 | 13 | func NewFileStore(path string) *FileStore { 14 | return &FileStore{ 15 | path: path, 16 | } 17 | } 18 | 19 | func ensureDir(dirPath string) error { 20 | _, err := os.Stat(dirPath) 21 | switch { 22 | case err == nil: 23 | return nil 24 | case os.IsNotExist(err): 25 | if err := os.MkdirAll(dirPath, 0755); err != nil { 26 | return fmt.Errorf("create directory %q: %w", dirPath, err) 27 | } 28 | return nil 29 | default: 30 | return fmt.Errorf("check directory %q: %w", dirPath, err) 31 | } 32 | } 33 | 34 | func (s *FileStore) Add(name, content, subPath string) error { 35 | dirPath := filepath.Join(s.path, subPath) 36 | 37 | if err := ensureDir(dirPath); err != nil { 38 | return err 39 | } 40 | 41 | file, err := os.Create(fmt.Sprintf("%s/%s", dirPath, name)) 42 | if err != nil { 43 | return fmt.Errorf("open file: %w", err) 44 | } 45 | 46 | defer file.Close() 47 | 48 | if _, err = file.Write([]byte(content)); err != nil { 49 | return fmt.Errorf("write file: %w", err) 50 | } 51 | 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /internal/controller/state/state.go: -------------------------------------------------------------------------------- 1 | package state 2 | 3 | import ( 4 | "strings" 5 | 6 | "k8s.io/apimachinery/pkg/runtime/schema" 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | ) 9 | 10 | type reconciliationState struct { 11 | data map[string]any 12 | } 13 | 14 | var ( 15 | ReconciliationState = reconciliationState{ 16 | data: map[string]any{}, 17 | } 18 | ) 19 | 20 | func (s *reconciliationState) ensureData() { 21 | if s.data == nil { 22 | s.data = map[string]any{} 23 | } 24 | } 25 | 26 | func (s *reconciliationState) buildKey(kind schema.ObjectKind, key client.ObjectKey) string { 27 | return strings.Join([]string{kind.GroupVersionKind().String(), 
key.String()}, "/") 28 | } 29 | 30 | func (s *reconciliationState) Set(kind schema.ObjectKind, key client.ObjectKey) { 31 | s.ensureData() 32 | 33 | s.data[s.buildKey(kind, key)] = nil 34 | } 35 | 36 | func (s *reconciliationState) Present(kind schema.ObjectKind, key client.ObjectKey) bool { 37 | s.ensureData() 38 | 39 | _, found := s.data[s.buildKey(kind, key)] 40 | return found 41 | } 42 | 43 | func (s *reconciliationState) Remove(kind schema.ObjectKind, key client.ObjectKey) { 44 | delete(s.data, s.buildKey(kind, key)) 45 | } 46 | -------------------------------------------------------------------------------- /internal/controller/topologyconfcontroller/nodetopology_controller_test.go: -------------------------------------------------------------------------------- 1 | package topologyconfcontroller_test 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | tc "nebius.ai/slurm-operator/internal/controller/topologyconfcontroller" 9 | ) 10 | 11 | func TestExtractTierLabels(t *testing.T) { 12 | // Test data 13 | k8sNodeLabels := map[string]string{ 14 | consts.TopologyLabelPrefix + "/tier-1": "leaf00", 15 | consts.TopologyLabelPrefix + "/other": "value", 16 | consts.TopologyLabelPrefix + "/tier-2": "spine00", 17 | "unrelated.label": "unrelatedValue", 18 | } 19 | 20 | // Expected result 21 | expected := map[string]string{ 22 | "tier-1": "leaf00", 23 | "tier-2": "spine00", 24 | } 25 | 26 | // Call the function 27 | result := tc.ExtractTierLabels(k8sNodeLabels, consts.TopologyLabelPrefix) 28 | 29 | // Validate the result 30 | if !reflect.DeepEqual(result, expected) { 31 | t.Errorf("ExtractTierLabels() = %v, want %v", result, expected) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/controllerconfig/options.go: -------------------------------------------------------------------------------- 1 | package controllerconfig 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "k8s.io/client-go/util/workqueue" 8 | "sigs.k8s.io/controller-runtime/pkg/controller" 9 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 10 | ) 11 | 12 | var ( 13 | optionsInit sync.Once 14 | defaultOptions *controller.Options 15 | ) 16 | 17 | func ControllerOptions(maxConcurrency int, cacheSyncTimeout time.Duration) controller.Options { 18 | rateLimiters := workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](30*time.Second, 5*time.Minute) 19 | optionsInit.Do(func() { 20 | defaultOptions = &controller.Options{ 21 | RateLimiter: rateLimiters, 22 | CacheSyncTimeout: cacheSyncTimeout, 23 | MaxConcurrentReconciles: maxConcurrency, 24 | } 25 | }) 26 | return *defaultOptions 27 | } 28 | -------------------------------------------------------------------------------- /internal/exporter/state.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type metricsCollectorState struct { 8 | lastUpdateTime time.Time 9 | } 10 | 11 | // newMetricsCollectorState initializes a new metrics collector state 12 | func newMetricsCollectorState() metricsCollectorState { 13 | return metricsCollectorState{ 14 | lastUpdateTime: time.Now(), 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /internal/jwt/consts.go: -------------------------------------------------------------------------------- 1 | package jwt 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | var ( 8 | DefaultTokenLifetime = time.Hour * 
24 * 30 9 | DefaultTokenEviction = time.Hour * 24 10 | ) 11 | 12 | const ( 13 | SigningKeyLength = 32 14 | DefaultMaxCacheEntries = 10 15 | ) 16 | -------------------------------------------------------------------------------- /internal/jwt/signing_key.go: -------------------------------------------------------------------------------- 1 | package jwt 2 | 3 | import ( 4 | "crypto/rand" 5 | ) 6 | 7 | // GenerateSigningKey generates a cryptographically secure random signing key. 8 | func GenerateSigningKey() ([]byte, error) { 9 | key := make([]byte, SigningKeyLength) 10 | 11 | _, err := rand.Read(key) 12 | if err != nil { 13 | return nil, err 14 | } 15 | 16 | return key, nil 17 | } 18 | -------------------------------------------------------------------------------- /internal/logfield/fields.go: -------------------------------------------------------------------------------- 1 | package logfield 2 | 3 | import ( 4 | "nebius.ai/slurm-operator/internal/consts" 5 | ) 6 | 7 | const ( 8 | SlurmCluster = consts.Slurm + ".cluster" 9 | ) 10 | 11 | const ( 12 | ClusterNamespace = SlurmCluster + ".namespace" 13 | ClusterName = SlurmCluster + ".name" 14 | 15 | ResourceKind = SlurmCluster + ".resourceKind" 16 | ResourceName = SlurmCluster + ".resourceName" 17 | 18 | SubResourceKind = SlurmCluster + ".subResourceKind" 19 | SubResourceName = SlurmCluster + ".subResourceName" 20 | ) 21 | -------------------------------------------------------------------------------- /internal/logfield/resource.go: -------------------------------------------------------------------------------- 1 | package logfield 2 | 3 | import ( 4 | "reflect" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/client" 7 | ) 8 | 9 | func ResourceKV(obj client.Object) []any { 10 | var kind string 11 | t := reflect.TypeOf(obj) 12 | if t.Kind() == reflect.Ptr { 13 | kind = t.Elem().Name() 14 | } else { 15 | kind = t.Name() 16 | } 17 | return []any{ 18 | SubResourceKind, kind, 19 | SubResourceName, obj.GetName(), 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /internal/render/accounting/deployment_test.go: -------------------------------------------------------------------------------- 1 | package accounting_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | appsv1 "k8s.io/api/apps/v1" 8 | 9 | "nebius.ai/slurm-operator/internal/consts" 10 | "nebius.ai/slurm-operator/internal/naming" 11 | "nebius.ai/slurm-operator/internal/render/accounting" 12 | "nebius.ai/slurm-operator/internal/render/common" 13 | ) 14 | 15 | func Test_RenderDeployment(t *testing.T) { 16 | 17 | deployment, err := accounting.RenderDeployment(defaultNamespace, defaultNameCluster, acc, defaultNodeFilter, defaultVolumeSources, slurmTopologyConfigMapRefName) 18 | assert.NoError(t, err) 19 | 20 | assert.Equal(t, naming.BuildDeploymentName(consts.ComponentTypeAccounting), deployment.Name) 21 | assert.Equal(t, defaultNamespace, deployment.Namespace) 22 | assert.Equal(t, common.RenderLabels(consts.ComponentTypeAccounting, defaultNameCluster), deployment.Labels) 23 | 24 | assert.Equal(t, &acc.Deployment.Replicas, deployment.Spec.Replicas) 25 | assert.Equal(t, appsv1.RecreateDeploymentStrategyType, deployment.Spec.Strategy.Type) 26 | assert.Equal(t, common.RenderMatchLabels(consts.ComponentTypeAccounting, defaultNameCluster), deployment.Spec.Selector.MatchLabels) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /internal/render/accounting/grant_test.go: 
-------------------------------------------------------------------------------- 1 | package accounting_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | slurmv1 "nebius.ai/slurm-operator/api/v1" 8 | "nebius.ai/slurm-operator/internal/consts" 9 | . "nebius.ai/slurm-operator/internal/render/accounting" 10 | "nebius.ai/slurm-operator/internal/values" 11 | ) 12 | 13 | func Test_RenderMariaDbGrant(t *testing.T) { 14 | namespace := "test-namespace" 15 | clusterName := "test-cluster" 16 | accounting := &values.SlurmAccounting{ 17 | MariaDb: slurmv1.MariaDbOperator{ 18 | Enabled: true, 19 | }, 20 | } 21 | 22 | grant, err := RenderMariaDbGrant(namespace, clusterName, accounting) 23 | 24 | assert.NoError(t, err) 25 | assert.NotNil(t, grant) 26 | assert.Equal(t, namespace, grant.Namespace) 27 | assert.Equal(t, clusterName+"-"+consts.MariaDbClusterSuffix, grant.Name) 28 | assert.Equal(t, "ALL PRIVILEGES", grant.Spec.Privileges[0]) 29 | } 30 | -------------------------------------------------------------------------------- /internal/render/accounting/mariadb_password.go: -------------------------------------------------------------------------------- 1 | package accounting 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/sethvargo/go-password/password" 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "nebius.ai/slurm-operator/internal/consts" 10 | ) 11 | 12 | func RenderSecretMariaDb( 13 | namespace, 14 | secretName, 15 | clusterName string, 16 | ) (*corev1.Secret, error) { 17 | generator, err := password.NewGenerator(&password.GeneratorInput{ 18 | Symbols: "@$^&*()_+-={}|[]<>/", 19 | }) 20 | if err != nil { 21 | return nil, fmt.Errorf("error creating password generator: %v", err) 22 | } 23 | password, err := generator.Generate(16, 4, 2, false, false) 24 | if err != nil { 25 | return nil, fmt.Errorf("error generating password Secret: %v", err) 26 | } 27 | 28 | annotations := map[string]string{ 29 | consts.AnnotationClusterName: clusterName, 30 | } 31 | labels := map[string]string{ 32 | consts.LabelNameKey: consts.LabelNameValue, 33 | consts.LabelValidateKey: consts.LabelValidateValue, 34 | } 35 | 36 | data := map[string][]byte{ 37 | "password": []byte(password), 38 | } 39 | 40 | return &corev1.Secret{ 41 | ObjectMeta: metav1.ObjectMeta{ 42 | Name: secretName, 43 | Namespace: namespace, 44 | Labels: labels, 45 | Annotations: annotations, 46 | }, 47 | Data: data, 48 | }, nil 49 | } 50 | -------------------------------------------------------------------------------- /internal/render/accounting/service.go: -------------------------------------------------------------------------------- 1 | package accounting 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/apimachinery/pkg/util/intstr" 7 | 8 | "nebius.ai/slurm-operator/internal/consts" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | "nebius.ai/slurm-operator/internal/values" 11 | ) 12 | 13 | // RenderService renders new [corev1.Service] serving Slurm accountings 14 | func RenderService(namespace, clusterName string, accounting *values.SlurmAccounting) *corev1.Service { 15 | return &corev1.Service{ 16 | ObjectMeta: metav1.ObjectMeta{ 17 | Name: accounting.Service.Name, 18 | Namespace: namespace, 19 | Labels: common.RenderLabels(consts.ComponentTypeAccounting, clusterName), 20 | }, 21 | Spec: corev1.ServiceSpec{ 22 | Type: accounting.Service.Type, 23 | Selector: 
common.RenderMatchLabels(consts.ComponentTypeAccounting, clusterName), 24 | ClusterIP: "", 25 | Ports: []corev1.ServicePort{{ 26 | Protocol: accounting.Service.Protocol, 27 | Port: accounting.ContainerAccounting.Port, 28 | TargetPort: intstr.FromString(accounting.ContainerAccounting.Name), 29 | }}, 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /internal/render/accounting/service_test.go: -------------------------------------------------------------------------------- 1 | package accounting_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | "k8s.io/apimachinery/pkg/util/intstr" 8 | 9 | "nebius.ai/slurm-operator/internal/consts" 10 | "nebius.ai/slurm-operator/internal/render/accounting" 11 | "nebius.ai/slurm-operator/internal/render/common" 12 | ) 13 | 14 | func Test_RenderService(t *testing.T) { 15 | namespace := "test-namespace" 16 | clusterName := "test-cluster" 17 | 18 | service := accounting.RenderService(namespace, clusterName, acc) 19 | 20 | assert.Equal(t, acc.Service.Name, service.Name) 21 | assert.Equal(t, namespace, service.Namespace) 22 | assert.Equal(t, common.RenderLabels(consts.ComponentTypeAccounting, clusterName), service.Labels) 23 | 24 | assert.Equal(t, acc.Service.Type, service.Spec.Type) 25 | assert.Equal(t, common.RenderMatchLabels(consts.ComponentTypeAccounting, clusterName), service.Spec.Selector) 26 | assert.Equal(t, "", service.Spec.ClusterIP) 27 | assert.Equal(t, acc.Service.Protocol, service.Spec.Ports[0].Protocol) 28 | assert.Equal(t, acc.ContainerAccounting.Port, service.Spec.Ports[0].Port) 29 | assert.Equal(t, intstr.FromString(acc.ContainerAccounting.Name), service.Spec.Ports[0].TargetPort) 30 | } 31 | -------------------------------------------------------------------------------- /internal/render/accounting/volume.go: -------------------------------------------------------------------------------- 1 | package accounting 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | "k8s.io/utils/ptr" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/naming" 9 | "nebius.ai/slurm-operator/internal/values" 10 | ) 11 | 12 | func RenderVolumeProjecitonSlurmdbdConfigs(clusterName string) *corev1.VolumeProjection { 13 | return &corev1.VolumeProjection{ 14 | Secret: &corev1.SecretProjection{ 15 | LocalObjectReference: corev1.LocalObjectReference{ 16 | Name: naming.BuildSecretSlurmdbdConfigsName(clusterName), 17 | }, 18 | Items: []corev1.KeyToPath{ 19 | { 20 | Key: consts.SlurmdbdConfFile, 21 | Path: consts.SlurmdbdConfFile, 22 | Mode: ptr.To(int32(0600)), 23 | }, 24 | }, 25 | }, 26 | } 27 | } 28 | 29 | func RenderVolumeMountSlurmdbdSpool() corev1.VolumeMount { 30 | return corev1.VolumeMount{ 31 | Name: consts.VolumeNameSpool, 32 | MountPath: consts.VolumeMountPathSpoolSlurmdbd, 33 | ReadOnly: false, 34 | } 35 | } 36 | 37 | func RenderVolumeSlurmdbdSpool(accounting *values.SlurmAccounting) corev1.Volume { 38 | return corev1.Volume{ 39 | Name: consts.VolumeNameSpool, 40 | VolumeSource: corev1.VolumeSource{ 41 | EmptyDir: &corev1.EmptyDirVolumeSource{ 42 | Medium: corev1.StorageMediumDefault, 43 | SizeLimit: accounting.ContainerAccounting.Resources.Storage(), 44 | }, 45 | }, 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /internal/render/accounting/volume_test.go: -------------------------------------------------------------------------------- 1 | package accounting_test 2 | 3 | import ( 4 | 
"testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | corev1 "k8s.io/api/core/v1" 8 | "k8s.io/apimachinery/pkg/api/resource" 9 | 10 | slurmv1 "nebius.ai/slurm-operator/api/v1" 11 | "nebius.ai/slurm-operator/internal/consts" 12 | "nebius.ai/slurm-operator/internal/render/accounting" 13 | "nebius.ai/slurm-operator/internal/values" 14 | ) 15 | 16 | func Test_RenderVolumeSlurmdbd(t *testing.T) { 17 | sizeGi := resource.MustParse("1Gi") 18 | testAcc := *acc 19 | testAcc.ContainerAccounting = values.Container{ 20 | NodeContainer: slurmv1.NodeContainer{ 21 | Resources: corev1.ResourceList{ 22 | corev1.ResourceStorage: sizeGi, 23 | }, 24 | }, 25 | } 26 | volume := accounting.RenderVolumeSlurmdbdSpool(&testAcc) 27 | assert.Equal(t, consts.VolumeNameSpool, volume.Name) 28 | assert.Equal(t, corev1.StorageMediumDefault, volume.VolumeSource.EmptyDir.Medium) 29 | assert.Equal(t, &sizeGi, volume.VolumeSource.EmptyDir.SizeLimit) 30 | volumeEmpty := accounting.RenderVolumeSlurmdbdSpool(acc) 31 | assert.Equal(t, &resource.Quantity{Format: "BinarySI"}, volumeEmpty.VolumeSource.EmptyDir.SizeLimit) 32 | } 33 | -------------------------------------------------------------------------------- /internal/render/common/label.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "nebius.ai/slurm-operator/internal/consts" 5 | ) 6 | 7 | // RenderLabels prepends to the provided labels, the default set of labels used for all resources. 8 | // These labels are recommended by k8s https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/ 9 | func RenderLabels(componentType consts.ComponentType, clusterName string) map[string]string { 10 | return map[string]string{ 11 | consts.LabelNameKey: consts.LabelNameValue, 12 | consts.LabelInstanceKey: clusterName, 13 | consts.LabelComponentKey: componentType.String(), 14 | consts.LabelPartOfKey: consts.LabelPartOfValue, 15 | consts.LabelManagedByKey: consts.LabelManagedByValue, 16 | } 17 | } 18 | 19 | // RenderMatchLabels prepends to the provided labels, the default set of match-labels used for all resources. 
20 | func RenderMatchLabels(componentType consts.ComponentType, clusterName string) map[string]string { 21 | return map[string]string{ 22 | consts.LabelNameKey: consts.LabelNameValue, 23 | consts.LabelInstanceKey: clusterName, 24 | consts.LabelComponentKey: componentType.String(), 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /internal/render/common/pod.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | "k8s.io/apimachinery/pkg/util/strategicpatch" 9 | ) 10 | 11 | const ( 12 | DefaultPodTerminationGracePeriodSeconds = int64(30) 13 | ) 14 | 15 | func MergePodTemplateSpecs( 16 | baseSpec corev1.PodTemplateSpec, 17 | refSpec *corev1.PodTemplateSpec, 18 | ) (corev1.PodTemplateSpec, error) { 19 | var result corev1.PodTemplateSpec 20 | 21 | originalJSON, err := json.Marshal(baseSpec) 22 | if err != nil { 23 | return corev1.PodTemplateSpec{}, fmt.Errorf("error marshalling original PodTemplateSpec: %v", err) 24 | } 25 | 26 | patchJSON, err := json.Marshal(refSpec) 27 | if err != nil { 28 | return corev1.PodTemplateSpec{}, fmt.Errorf("error marshalling patch PodTemplateSpec: %v", err) 29 | } 30 | 31 | mergedJSON, err := strategicpatch.StrategicMergePatch(originalJSON, patchJSON, &corev1.PodTemplateSpec{}) 32 | if err != nil { 33 | return corev1.PodTemplateSpec{}, fmt.Errorf("error performing strategic merge: %v", err) 34 | } 35 | 36 | // Unmarshal the merged JSON back into a struct 37 | err = json.Unmarshal(mergedJSON, &result) 38 | if err != nil { 39 | return corev1.PodTemplateSpec{}, fmt.Errorf("error unmarshalling merged PodTemplateSpec: %v", err) 40 | } 41 | 42 | return result, nil 43 | } 44 | -------------------------------------------------------------------------------- /internal/render/common/probe.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | const ( 4 | DefaultProbeTimeoutSeconds = 1 5 | DefaultProbePeriodSeconds = 10 6 | DefaultProbeSuccessThreshold = 1 7 | DefaultProbeFailureThreshold = 3 8 | ) 9 | -------------------------------------------------------------------------------- /internal/render/common/resources.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | ) 6 | 7 | // CopyNonCPUResources returns a copy of corev1.ResourceList but excludes CPU from it. 8 | // This is useful for getting resource limits from resource requests for K8s containers. 9 | // They usually should be identical for everything except CPU, because setting CPU limits may lead to throttling by CFS scheduler in Linux. 10 | func CopyNonCPUResources(resourceList corev1.ResourceList) corev1.ResourceList { 11 | limits := corev1.ResourceList{} 12 | for resourceName, quantity := range resourceList { 13 | if resourceName != corev1.ResourceCPU { 14 | limits[resourceName] = quantity 15 | } 16 | } 17 | return limits 18 | } 19 | 20 | type RenderOption func(*renderOptions) 21 | 22 | type renderOptions struct { 23 | guaranteed bool 24 | } 25 | 26 | // GuaranteedPod is a RenderOption that sets the guaranteed flag 27 | // Needed for setting the limits of the container to the same values as the requests. 28 | // This is useful for slurm worker cgroupv2 support.
29 | // It's necessary for cpuset https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy 30 | func GuaranteedPod(guaranteed bool) RenderOption { 31 | return func(opts *renderOptions) { 32 | opts.guaranteed = guaranteed 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /internal/render/controller/service.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "k8s.io/apimachinery/pkg/util/intstr" 7 | 8 | "nebius.ai/slurm-operator/internal/consts" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | "nebius.ai/slurm-operator/internal/values" 11 | ) 12 | 13 | // RenderService renders new [corev1.Service] serving Slurm controllers 14 | func RenderService(namespace, clusterName string, controller *values.SlurmController) corev1.Service { 15 | return corev1.Service{ 16 | ObjectMeta: metav1.ObjectMeta{ 17 | Name: controller.Service.Name, 18 | Namespace: namespace, 19 | Labels: common.RenderLabels(consts.ComponentTypeController, clusterName), 20 | }, 21 | Spec: corev1.ServiceSpec{ 22 | Type: controller.Service.Type, 23 | Selector: common.RenderMatchLabels(consts.ComponentTypeController, clusterName), 24 | ClusterIP: "None", 25 | Ports: []corev1.ServicePort{{ 26 | Protocol: controller.Service.Protocol, 27 | Port: controller.ContainerSlurmctld.Port, 28 | TargetPort: intstr.FromString(controller.ContainerSlurmctld.Name), 29 | }}, 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /internal/render/exporter/container.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | 8 | "nebius.ai/slurm-operator/internal/consts" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | "nebius.ai/slurm-operator/internal/render/rest" 11 | "nebius.ai/slurm-operator/internal/values" 12 | ) 13 | 14 | func renderContainerExporter(clusterValues *values.SlurmCluster) corev1.Container { 15 | return corev1.Container{ 16 | Name: consts.ContainerNameExporter, 17 | Image: clusterValues.SlurmExporter.Container.Image, 18 | Command: clusterValues.SlurmExporter.Container.Command, 19 | Args: []string{ 20 | fmt.Sprintf("--cluster-namespace=%s", clusterValues.Namespace), 21 | fmt.Sprintf("--cluster-name=%s", clusterValues.Name), 22 | fmt.Sprintf("--slurm-api-server=%s", rest.GetServiceURL(clusterValues.Namespace, &clusterValues.NodeRest)), 23 | fmt.Sprintf("--soperator-version=%s", clusterValues.CRVersion), 24 | }, 25 | ImagePullPolicy: clusterValues.SlurmExporter.Container.ImagePullPolicy, 26 | Ports: []corev1.ContainerPort{ 27 | { 28 | Name: consts.ContainerPortNameExporter, 29 | ContainerPort: consts.ContainerPortExporter, 30 | }, 31 | }, 32 | Resources: corev1.ResourceRequirements{ 33 | Requests: clusterValues.SlurmExporter.Container.Resources, 34 | Limits: common.CopyNonCPUResources(clusterValues.SlurmExporter.Container.Resources), 35 | }, 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /internal/render/exporter/names.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | func buildExporterServiceAccountName(clusterName string) string { 4 | return clusterName + "-exporter-sa" 5 | } 6 | 7 | func 
buildExporterRoleName(clusterName string) string { 8 | return clusterName + "-exporter-role" 9 | } 10 | 11 | func buildExporterRoleBindingName(clusterName string) string { 12 | return clusterName + "-exporter-role-binding" 13 | } 14 | -------------------------------------------------------------------------------- /internal/render/exporter/pod.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | slurmv1 "nebius.ai/slurm-operator/api/v1" 8 | "nebius.ai/slurm-operator/internal/utils" 9 | "nebius.ai/slurm-operator/internal/values" 10 | ) 11 | 12 | func renderPodTemplateSpec( 13 | clusterValues *values.SlurmCluster, 14 | initContainers []corev1.Container, 15 | matchLabels map[string]string, 16 | ) corev1.PodTemplateSpec { 17 | nodeFilter, err := utils.GetBy( 18 | clusterValues.NodeFilters, 19 | clusterValues.SlurmExporter.K8sNodeFilterName, 20 | func(f slurmv1.K8sNodeFilter) string { return f.Name }, 21 | ) 22 | if err != nil { 23 | _ = err // Ignore not found error, use "empty" node filter. 24 | nodeFilter = slurmv1.K8sNodeFilter{} 25 | } 26 | result := corev1.PodTemplateSpec{ 27 | ObjectMeta: metav1.ObjectMeta{ 28 | Labels: matchLabels, 29 | }, 30 | Spec: corev1.PodSpec{ 31 | Affinity: nodeFilter.Affinity, 32 | Tolerations: nodeFilter.Tolerations, 33 | NodeSelector: nodeFilter.NodeSelector, 34 | InitContainers: initContainers, 35 | Containers: []corev1.Container{renderContainerExporter(clusterValues)}, 36 | ServiceAccountName: buildExporterServiceAccountName(clusterValues.Name), 37 | }, 38 | } 39 | return result 40 | } 41 | -------------------------------------------------------------------------------- /internal/render/exporter/role.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderRole(clusterNamespace, clusterName string) rbacv1.Role { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return rbacv1.Role{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterRoleName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | Rules: []rbacv1.PolicyRule{ 21 | { 22 | APIGroups: []string{""}, 23 | Resources: []string{"secrets"}, 24 | Verbs: []string{"get", "list", "watch"}, 25 | }, 26 | }, 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /internal/render/exporter/role_binding.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderExporterRoleBinding(clusterNamespace, clusterName string) rbacv1.RoleBinding { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return rbacv1.RoleBinding{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterRoleBindingName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | Subjects: []rbacv1.Subject{ 21 | { 22 | Kind: rbacv1.ServiceAccountKind, 23 | Name: 
buildExporterServiceAccountName(clusterName), 24 | Namespace: clusterNamespace, 25 | }, 26 | }, 27 | RoleRef: rbacv1.RoleRef{ 28 | Kind: "Role", 29 | Name: buildExporterRoleName(clusterName), 30 | APIGroup: rbacv1.GroupName, 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/render/exporter/service_account.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderServiceAccount(clusterNamespace, clusterName string) corev1.ServiceAccount { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return corev1.ServiceAccount{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterServiceAccountName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /internal/render/exporter/vars_test.go: -------------------------------------------------------------------------------- 1 | package exporter_test 2 | 3 | const ( 4 | defaultNamespace = "test-namespace" 5 | defaultNameCluster = "test-cluster" 6 | ) 7 | -------------------------------------------------------------------------------- /internal/render/nodeconfigurator/daemonset.go: -------------------------------------------------------------------------------- 1 | package nodeconfigurator 2 | 3 | import ( 4 | appsv1 "k8s.io/api/apps/v1" 5 | corev1 "k8s.io/api/core/v1" 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | 8 | slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" 9 | ) 10 | 11 | // RenderDaemonSet renders the DaemonSet for the node-configurator 12 | func RenderDaemonSet( 13 | nodeConfigurator *slurmv1alpha1.NodeConfigurator, 14 | namespace string, 15 | ) *appsv1.DaemonSet { 16 | if nodeConfigurator == nil { 17 | return nil 18 | } 19 | 20 | labels, matchLabels := renderLabels(nodeConfigurator.Name) 21 | 22 | return &appsv1.DaemonSet{ 23 | ObjectMeta: metav1.ObjectMeta{ 24 | Name: nodeConfigurator.Name + "-ds", 25 | Namespace: namespace, 26 | Labels: labels, 27 | }, 28 | Spec: appsv1.DaemonSetSpec{ 29 | Selector: &metav1.LabelSelector{ 30 | MatchLabels: matchLabels, 31 | }, 32 | Template: corev1.PodTemplateSpec{ 33 | ObjectMeta: metav1.ObjectMeta{ 34 | Labels: labels, 35 | }, 36 | Spec: renderPodSpec(nodeConfigurator.Spec), 37 | }, 38 | }, 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /internal/render/prometheus/exporter_test.go: -------------------------------------------------------------------------------- 1 | package prometheus_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | 9 | . 
"nebius.ai/slurm-operator/internal/render/prometheus" 10 | "nebius.ai/slurm-operator/internal/values" 11 | ) 12 | 13 | func Test_RenderDeploymentExporter_Error(t *testing.T) { 14 | 15 | exporterNil := &values.SlurmExporter{} 16 | 17 | exporterImageNil := &values.SlurmExporter{ 18 | Enabled: true, 19 | } 20 | 21 | testCases := []struct { 22 | valuesExporter *values.SlurmExporter 23 | expectedError error 24 | }{ 25 | { 26 | valuesExporter: exporterNil, 27 | expectedError: errors.New("prometheus is not enabled"), 28 | }, 29 | { 30 | valuesExporter: exporterImageNil, 31 | expectedError: errors.New("image for ContainerExporter is empty"), 32 | }, 33 | } 34 | 35 | for _, tc := range testCases { 36 | t.Run("exporter", func(t *testing.T) { 37 | 38 | _, err := RenderDeploymentExporter( 39 | defaultNamespace, defaultNameCluster, tc.valuesExporter, defaultNodeFilter, defaultVolumeSources, defaultPodTemplate, 40 | ) 41 | if err == nil { 42 | t.Errorf("expected error, got nil") 43 | } 44 | 45 | assert.Equal(t, tc.expectedError, err) 46 | }) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /internal/render/prometheus/names.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | func buildExporterServiceAccountName(clusterName string) string { 4 | return clusterName + "-exporter-sa" 5 | } 6 | 7 | func buildExporterRoleName(clusterName string) string { 8 | return clusterName + "-exporter-role" 9 | } 10 | 11 | func buildExporterRoleBindingName(clusterName string) string { 12 | return clusterName + "-exporter-role-binding" 13 | } 14 | -------------------------------------------------------------------------------- /internal/render/prometheus/role.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderExporterRole(clusterNamespace, clusterName string) rbacv1.Role { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return rbacv1.Role{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterRoleName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | Rules: []rbacv1.PolicyRule{ 21 | { 22 | APIGroups: []string{""}, 23 | Resources: []string{"secrets"}, 24 | Verbs: []string{"get", "list", "watch"}, 25 | }, 26 | }, 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /internal/render/prometheus/role_binding.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderExporterRoleBinding(clusterNamespace, clusterName string) rbacv1.RoleBinding { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return rbacv1.RoleBinding{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterRoleBindingName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | Subjects: []rbacv1.Subject{ 21 | { 22 | Kind: rbacv1.ServiceAccountKind, 23 | Name: 
buildExporterServiceAccountName(clusterName), 24 | Namespace: clusterNamespace, 25 | }, 26 | }, 27 | RoleRef: rbacv1.RoleRef{ 28 | Kind: "Role", 29 | Name: buildExporterRoleName(clusterName), 30 | APIGroup: rbacv1.GroupName, 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/render/prometheus/serviceaccount.go: -------------------------------------------------------------------------------- 1 | package prometheus 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderServiceAccount(clusterNamespace, clusterName string) corev1.ServiceAccount { 12 | labels := common.RenderLabels(consts.ComponentTypeExporter, clusterName) 13 | 14 | return corev1.ServiceAccount{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: buildExporterServiceAccountName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /internal/render/rest/secret.go: -------------------------------------------------------------------------------- 1 | package rest 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/jwt" 9 | "nebius.ai/slurm-operator/internal/naming" 10 | "nebius.ai/slurm-operator/internal/render/common" 11 | ) 12 | 13 | func RenderSecret(namespace, clusterName string) (corev1.Secret, error) { 14 | secretName := naming.BuildSecretSlurmRESTSecretName(clusterName) 15 | labels := common.RenderLabels(consts.ComponentTypeREST, clusterName) 16 | key, err := jwt.GenerateSigningKey() 17 | if err != nil { 18 | return corev1.Secret{}, err 19 | } 20 | 21 | data := map[string][]byte{consts.SecretRESTJWTKeyFileName: key} 22 | 23 | return corev1.Secret{ 24 | ObjectMeta: metav1.ObjectMeta{ 25 | Name: secretName, 26 | Namespace: namespace, 27 | Labels: labels, 28 | }, 29 | Data: data, 30 | }, nil 31 | } 32 | -------------------------------------------------------------------------------- /internal/render/rest/service.go: -------------------------------------------------------------------------------- 1 | package rest 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | "k8s.io/apimachinery/pkg/util/intstr" 9 | 10 | "nebius.ai/slurm-operator/internal/consts" 11 | "nebius.ai/slurm-operator/internal/render/common" 12 | "nebius.ai/slurm-operator/internal/values" 13 | ) 14 | 15 | // RenderService renders new [corev1.Service] serving Slurm REST API 16 | func RenderService(namespace, clusterName string, rest *values.SlurmREST) *corev1.Service { 17 | return &corev1.Service{ 18 | ObjectMeta: metav1.ObjectMeta{ 19 | Name: rest.Service.Name, 20 | Namespace: namespace, 21 | Labels: common.RenderLabels(consts.ComponentTypeREST, clusterName), 22 | }, 23 | Spec: corev1.ServiceSpec{ 24 | Type: rest.Service.Type, 25 | Selector: common.RenderMatchLabels(consts.ComponentTypeREST, clusterName), 26 | ClusterIP: "", 27 | Ports: []corev1.ServicePort{{ 28 | Protocol: rest.Service.Protocol, 29 | Port: rest.ContainerREST.Port, 30 | TargetPort: intstr.FromString(rest.ContainerREST.Name), 31 | }}, 32 | }, 33 | } 34 | } 35 | 36 | func GetServiceURL(namespace string, rest *values.SlurmREST) string { 37 | 
return fmt.Sprintf("http://%s.%s.svc:%d", rest.Service.Name, namespace, rest.ContainerREST.Port) 38 | } 39 | -------------------------------------------------------------------------------- /internal/render/sconfigcontroller/container.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | "fmt" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | 8 | "nebius.ai/slurm-operator/internal/consts" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | "nebius.ai/slurm-operator/internal/values" 11 | ) 12 | 13 | func renderContainerSConfigController(clusterNamespace, clusterName, slurmAPIServer string, container values.Container) corev1.Container { 14 | // Derive the container's limits by copying all non-CPU resources from its requests 15 | limits := common.CopyNonCPUResources(container.Resources) 16 | 17 | return corev1.Container{ 18 | Name: consts.ContainerNameSConfigController, 19 | Image: container.Image, 20 | ImagePullPolicy: container.ImagePullPolicy, 21 | VolumeMounts: []corev1.VolumeMount{ 22 | common.RenderVolumeMountJail(), 23 | }, 24 | Resources: corev1.ResourceRequirements{ 25 | Limits: limits, 26 | Requests: container.Resources, 27 | }, 28 | Command: []string{ 29 | "/usr/bin/sconfigcontroller", 30 | }, 31 | Args: []string{ 32 | fmt.Sprintf("--cluster-namespace=%s", clusterNamespace), 33 | fmt.Sprintf("--cluster-name=%s", clusterName), 34 | fmt.Sprintf("--slurmapiserver=%s", slurmAPIServer), 35 | }, 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /internal/render/sconfigcontroller/initcontainer.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | "k8s.io/utils/ptr" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func renderInitContainerSConfigController() corev1.Container { 12 | // Prepare the slurm directory inside the jail before the main container starts 13 | 14 | // restartPolicy := corev1.ContainerRestartPolicyAlways 15 | return corev1.Container{ 16 | Name: "init-dir", 17 | Image: consts.InitContainerImageSconfigController, 18 | ImagePullPolicy: corev1.PullIfNotPresent, 19 | SecurityContext: &corev1.SecurityContext{ 20 | RunAsUser: ptr.To(int64(0)), 21 | }, 22 | VolumeMounts: []corev1.VolumeMount{ 23 | common.RenderVolumeMountJail(), 24 | }, 25 | Command: []string{"/bin/sh", "-c"}, // Use sh to execute the setup command 26 | Args: []string{ 27 | "cd /mnt/jail && mkdir -p slurm && chown 1001:1001 slurm && chmod 755 slurm", 28 | }, 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /internal/render/sconfigcontroller/role.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "nebius.ai/slurm-operator/internal/consts" 7 | "nebius.ai/slurm-operator/internal/naming" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderRole(clusterNamespace, clusterName string) rbacv1.Role { 12 | labels := common.RenderLabels(consts.ComponentTypeSConfigController, clusterName) 13 | 14 | return rbacv1.Role{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: naming.BuildRoleSConfigControllerName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 |
}, 20 | Rules: []rbacv1.PolicyRule{ 21 | { 22 | APIGroups: []string{""}, 23 | Resources: []string{"configmaps"}, 24 | Verbs: []string{"get", "list", "watch"}, 25 | }, 26 | { 27 | APIGroups: []string{""}, 28 | Resources: []string{"secrets"}, 29 | Verbs: []string{"get", "list", "watch"}, 30 | }, 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/render/sconfigcontroller/role_binding.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "nebius.ai/slurm-operator/internal/consts" 7 | "nebius.ai/slurm-operator/internal/naming" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderRoleBinding(clusterNamespace, clusterName string) rbacv1.RoleBinding { 12 | labels := common.RenderLabels(consts.ComponentTypeSConfigController, clusterName) 13 | 14 | return rbacv1.RoleBinding{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: naming.BuildRoleBindingSConfigControllerName(clusterName), 17 | Namespace: clusterNamespace, 18 | Labels: labels, 19 | }, 20 | Subjects: []rbacv1.Subject{ 21 | { 22 | Kind: rbacv1.ServiceAccountKind, 23 | Name: naming.BuildServiceAccountSconfigControllerName(clusterName), 24 | Namespace: clusterNamespace, 25 | }, 26 | }, 27 | RoleRef: rbacv1.RoleRef{ 28 | Kind: "Role", 29 | Name: naming.BuildRoleSConfigControllerName(clusterName), 30 | APIGroup: rbacv1.GroupName, 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/render/sconfigcontroller/serviceaccount.go: -------------------------------------------------------------------------------- 1 | package sconfigcontroller 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/naming" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | ) 11 | 12 | func RenderServiceAccount(clusterNamespace, clusterName string) corev1.ServiceAccount { 13 | labels := common.RenderLabels(consts.ComponentTypeWorker, clusterName) 14 | 15 | return corev1.ServiceAccount{ 16 | ObjectMeta: metav1.ObjectMeta{ 17 | Name: naming.BuildServiceAccountSconfigControllerName(clusterName), 18 | Namespace: clusterNamespace, 19 | Labels: labels, 20 | }, 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /internal/render/soperatorchecks/configmap.go: -------------------------------------------------------------------------------- 1 | package soperatorchecks 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | 7 | "nebius.ai/slurm-operator/internal/consts" 8 | "nebius.ai/slurm-operator/internal/naming" 9 | "nebius.ai/slurm-operator/internal/render/common" 10 | ) 11 | 12 | // RenderSbatchConfigMap renders new [corev1.ConfigMap] containing slurm sbatch script 13 | func RenderSbatchConfigMap(checkName, clusterName, namespace, script string) corev1.ConfigMap { 14 | return corev1.ConfigMap{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: naming.BuildConfigMapSbatchScriptName(checkName), 17 | Namespace: namespace, 18 | Labels: common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName), 19 | }, 20 | Data: map[string]string{ 21 | consts.ConfigMapKeySoperatorcheckSbatch: script, 22 | }, 23 | } 24 | } 25 | 
-------------------------------------------------------------------------------- /internal/render/soperatorchecks/job.go: -------------------------------------------------------------------------------- 1 | package soperatorchecks 2 | 3 | import ( 4 | "fmt" 5 | 6 | batchv1 "k8s.io/api/batch/v1" 7 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | "k8s.io/utils/ptr" 9 | 10 | slurmv1alpha1 "nebius.ai/slurm-operator/api/v1alpha1" 11 | "nebius.ai/slurm-operator/internal/consts" 12 | "nebius.ai/slurm-operator/internal/render/common" 13 | ) 14 | 15 | func RenderK8sJob(check *slurmv1alpha1.ActiveCheck, cronJob *batchv1.CronJob) *batchv1.Job { 16 | labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, check.Spec.SlurmClusterRefName) 17 | 18 | return &batchv1.Job{ 19 | ObjectMeta: metav1.ObjectMeta{ 20 | Name: RenderK8sJobName(check), 21 | Namespace: check.Namespace, 22 | OwnerReferences: []metav1.OwnerReference{ 23 | { 24 | APIVersion: cronJob.APIVersion, 25 | Kind: cronJob.Kind, 26 | Name: cronJob.Name, 27 | UID: cronJob.UID, 28 | Controller: ptr.To(true), 29 | BlockOwnerDeletion: ptr.To(true), 30 | }, 31 | }, 32 | Labels: labels, 33 | Annotations: cronJob.Spec.JobTemplate.Annotations, 34 | }, 35 | Spec: *cronJob.Spec.JobTemplate.Spec.DeepCopy(), 36 | } 37 | } 38 | 39 | func RenderK8sJobName(check *slurmv1alpha1.ActiveCheck) string { 40 | return fmt.Sprintf("%s-initial-run", check.Spec.Name) 41 | } 42 | -------------------------------------------------------------------------------- /internal/render/soperatorchecks/role.go: -------------------------------------------------------------------------------- 1 | package soperatorchecks 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "nebius.ai/slurm-operator/internal/consts" 7 | "nebius.ai/slurm-operator/internal/naming" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderRole(namespace, clusterName string) rbacv1.Role { 12 | labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName) 13 | 14 | return rbacv1.Role{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: naming.BuildRoleActiveCheckName(clusterName), 17 | Namespace: namespace, 18 | Labels: labels, 19 | }, 20 | Rules: []rbacv1.PolicyRule{ 21 | { 22 | APIGroups: []string{""}, 23 | Resources: []string{"pods"}, 24 | Verbs: []string{"get"}, 25 | }, 26 | { 27 | APIGroups: []string{"batch"}, 28 | Resources: []string{"jobs"}, 29 | Verbs: []string{"get", "patch"}, 30 | }, 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /internal/render/soperatorchecks/role_binding.go: -------------------------------------------------------------------------------- 1 | package soperatorchecks 2 | 3 | import ( 4 | rbacv1 "k8s.io/api/rbac/v1" 5 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 6 | "nebius.ai/slurm-operator/internal/consts" 7 | "nebius.ai/slurm-operator/internal/naming" 8 | "nebius.ai/slurm-operator/internal/render/common" 9 | ) 10 | 11 | func RenderRoleBinding(namespace, clusterName string) rbacv1.RoleBinding { 12 | labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName) 13 | 14 | return rbacv1.RoleBinding{ 15 | ObjectMeta: metav1.ObjectMeta{ 16 | Name: naming.BuildRoleBindingActiveCheckName(clusterName), 17 | Namespace: namespace, 18 | Labels: labels, 19 | }, 20 | Subjects: []rbacv1.Subject{ 21 | { 22 | Kind: rbacv1.ServiceAccountKind, 23 | Name: naming.BuildServiceAccountActiveCheckName(clusterName), 24 | 
--------------------------------------------------------------------------------
/internal/render/soperatorchecks/role.go:
--------------------------------------------------------------------------------
package soperatorchecks

import (
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderRole renders a new [rbacv1.Role] granting active check jobs read access to pods
// and get/patch access to batch jobs
func RenderRole(namespace, clusterName string) rbacv1.Role {
	labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName)

	return rbacv1.Role{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildRoleActiveCheckName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
		Rules: []rbacv1.PolicyRule{
			{
				APIGroups: []string{""},
				Resources: []string{"pods"},
				Verbs:     []string{"get"},
			},
			{
				APIGroups: []string{"batch"},
				Resources: []string{"jobs"},
				Verbs:     []string{"get", "patch"},
			},
		},
	}
}

--------------------------------------------------------------------------------
/internal/render/soperatorchecks/role_binding.go:
--------------------------------------------------------------------------------
package soperatorchecks

import (
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderRoleBinding renders a new [rbacv1.RoleBinding] binding the active check ServiceAccount to its Role
func RenderRoleBinding(namespace, clusterName string) rbacv1.RoleBinding {
	labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName)

	return rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildRoleBindingActiveCheckName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
		Subjects: []rbacv1.Subject{
			{
				Kind:      rbacv1.ServiceAccountKind,
				Name:      naming.BuildServiceAccountActiveCheckName(clusterName),
				Namespace: namespace,
			},
		},
		RoleRef: rbacv1.RoleRef{
			Kind:     "Role",
			Name:     naming.BuildRoleActiveCheckName(clusterName),
			APIGroup: rbacv1.GroupName,
		},
	}
}

--------------------------------------------------------------------------------
/internal/render/soperatorchecks/serviceaccount.go:
--------------------------------------------------------------------------------
package soperatorchecks

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderServiceAccount renders a new [corev1.ServiceAccount] used by active check jobs
func RenderServiceAccount(namespace, clusterName string) corev1.ServiceAccount {
	labels := common.RenderLabels(consts.ComponentTypeSoperatorChecks, clusterName)

	return corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildServiceAccountActiveCheckName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
	}
}
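// Editor's note (illustrative sketch, not a file in this repository): the three
// render functions above are meant to be applied as a set; the RoleBinding
// points at the ServiceAccount and the Role purely through the shared naming
// helpers. A quick property check of that wiring, with hypothetical names:
package soperatorchecks

import "testing"

func Test_ActiveCheckRBACWiring_sketch(t *testing.T) {
	ns, cluster := "soperator", "my-cluster"

	sa := RenderServiceAccount(ns, cluster)
	role := RenderRole(ns, cluster)
	rb := RenderRoleBinding(ns, cluster)

	if rb.RoleRef.Name != role.Name {
		t.Errorf("RoleBinding refers to role %q, but the rendered Role is %q", rb.RoleRef.Name, role.Name)
	}
	if len(rb.Subjects) != 1 || rb.Subjects[0].Name != sa.Name || rb.Subjects[0].Namespace != sa.Namespace {
		t.Errorf("RoleBinding subject %v does not match the rendered ServiceAccount %s/%s", rb.Subjects, sa.Namespace, sa.Name)
	}
}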
--------------------------------------------------------------------------------
/internal/render/worker/role.go:
--------------------------------------------------------------------------------
package worker

import (
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderRole renders a new [rbacv1.Role] allowing Slurm workers to create Kubernetes events
func RenderRole(namespace, clusterName string) rbacv1.Role {
	labels := common.RenderLabels(consts.ComponentTypeWorker, clusterName)

	return rbacv1.Role{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildRoleWorkerName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
		Rules: []rbacv1.PolicyRule{
			{
				APIGroups: []string{""},
				Resources: []string{"events"},
				Verbs:     []string{"create"},
			},
		},
	}
}

--------------------------------------------------------------------------------
/internal/render/worker/role_binding.go:
--------------------------------------------------------------------------------
package worker

import (
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderRoleBinding renders a new [rbacv1.RoleBinding] binding the worker ServiceAccount to its Role
func RenderRoleBinding(namespace, clusterName string) rbacv1.RoleBinding {
	labels := common.RenderLabels(consts.ComponentTypeWorker, clusterName)

	return rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildRoleBindingWorkerName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
		Subjects: []rbacv1.Subject{
			{
				Kind:      rbacv1.ServiceAccountKind,
				Name:      naming.BuildServiceAccountWorkerName(clusterName),
				Namespace: namespace,
			},
		},
		RoleRef: rbacv1.RoleRef{
			Kind:     "Role",
			Name:     naming.BuildRoleWorkerName(clusterName),
			APIGroup: rbacv1.GroupName,
		},
	}
}

--------------------------------------------------------------------------------
/internal/render/worker/role_test.go:
--------------------------------------------------------------------------------
package worker

import (
	"testing"

	"nebius.ai/slurm-operator/internal/naming"
)

func Test_RenderRole(t *testing.T) {
	namespace := "test-namespace"
	clusterName := "test-cluster"

	role := RenderRole(namespace, clusterName)

	// Check the name
	if role.Name != naming.BuildRoleWorkerName(clusterName) {
		t.Errorf("Unexpected name: got %v, want %v", role.Name, naming.BuildRoleWorkerName(clusterName))
	}

	// Check the namespace
	if role.Namespace != namespace {
		t.Errorf("Unexpected namespace: got %v, want %v", role.Namespace, namespace)
	}

	// Check the rules
	if len(role.Rules) != 1 || role.Rules[0].APIGroups[0] != "" || role.Rules[0].Resources[0] != "events" || role.Rules[0].Verbs[0] != "create" {
		t.Errorf("Unexpected rules: got %v, want one rule with apiGroups=[\"\"], resources=[\"events\"], and verbs=[\"create\"]", role.Rules)
	}
}

--------------------------------------------------------------------------------
/internal/render/worker/service.go:
--------------------------------------------------------------------------------
package worker

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"

	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/render/common"
	"nebius.ai/slurm-operator/internal/values"
)

// RenderService renders a new [corev1.Service] serving Slurm workers
func RenderService(namespace, clusterName string, worker *values.SlurmWorker) corev1.Service {
	return corev1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name:      worker.Service.Name,
			Namespace: namespace,
			Labels:    common.RenderLabels(consts.ComponentTypeWorker, clusterName),
		},
		Spec: corev1.ServiceSpec{
			Type:      worker.Service.Type,
			Selector:  common.RenderMatchLabels(consts.ComponentTypeWorker, clusterName),
			ClusterIP: "None",
			Ports: []corev1.ServicePort{{
				Protocol:   worker.Service.Protocol,
				Port:       worker.ContainerSlurmd.Port,
				TargetPort: intstr.FromString(worker.ContainerSlurmd.Name),
			}},
		},
	}
}

--------------------------------------------------------------------------------
/internal/render/worker/serviceaccount.go:
--------------------------------------------------------------------------------
package worker

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
	"nebius.ai/slurm-operator/internal/render/common"
)

// RenderServiceAccount renders a new [corev1.ServiceAccount] used by Slurm workers
func RenderServiceAccount(namespace, clusterName string) corev1.ServiceAccount {
	labels := common.RenderLabels(consts.ComponentTypeWorker, clusterName)

	return corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      naming.BuildServiceAccountWorkerName(clusterName),
			Namespace: namespace,
			Labels:    labels,
		},
	}
}

--------------------------------------------------------------------------------
/internal/slurmapi/client_set.go:
--------------------------------------------------------------------------------
package slurmapi

import (
	"maps"
	"sync"

	"k8s.io/apimachinery/pkg/types"
)

// ClientSet is a concurrency-safe registry of per-cluster Slurm API clients keyed by NamespacedName
type ClientSet struct {
	slurmAPIClients map[types.NamespacedName]Client
	mux             *sync.Mutex
}

func NewClientSet() *ClientSet {
	return &ClientSet{
		slurmAPIClients: make(map[types.NamespacedName]Client),
		mux:             &sync.Mutex{},
	}
}

func (cs *ClientSet) AddClient(name types.NamespacedName, client Client) {
	cs.mux.Lock()
	defer cs.mux.Unlock()

	cs.slurmAPIClients[name] = client
}

func (cs *ClientSet) GetClient(name types.NamespacedName) (Client, bool) {
	cs.mux.Lock()
	defer cs.mux.Unlock()

	client, found := cs.slurmAPIClients[name]
	return client, found
}

func (cs *ClientSet) GetClients() (slurmAPIClients map[types.NamespacedName]Client) {
	cs.mux.Lock()
	defer cs.mux.Unlock()

	return maps.Clone(cs.slurmAPIClients)
}
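// Editor's note (illustrative sketch, not a file in this repository): typical
// use of the ClientSet above from reconciler code inside this module. How the
// concrete slurmapi.Client is constructed (REST endpoint, auth token, etc.) is
// out of scope here and assumed to happen elsewhere.
package example

import (
	"k8s.io/apimachinery/pkg/types"

	"nebius.ai/slurm-operator/internal/slurmapi"
)

func rememberCluster(clients *slurmapi.ClientSet, key types.NamespacedName, c slurmapi.Client) {
	// Register (or replace) the API client for this Slurm cluster.
	clients.AddClient(key, c)

	// Later, e.g. in another reconcile pass, look it up by the same key.
	if apiClient, ok := clients.GetClient(key); ok {
		_ = apiClient // e.g. call apiClient.ListNodes(ctx) here
	}
}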
--------------------------------------------------------------------------------
/internal/slurmapi/interface.go:
--------------------------------------------------------------------------------
package slurmapi

import (
	"context"

	api "github.com/SlinkyProject/slurm-client/api/v0041"
)

// Client extends the generated Slurm REST API client (v0041) with typed helpers for nodes and jobs
type Client interface {
	api.ClientWithResponsesInterface
	ListNodes(ctx context.Context) ([]Node, error)
	GetNode(ctx context.Context, nodeName string) (Node, error)
	GetJobStatus(ctx context.Context, jobID string) (JobStatus, error)
	ListJobs(ctx context.Context) ([]Job, error)
}

--------------------------------------------------------------------------------
/internal/utils/get_by.go:
--------------------------------------------------------------------------------
package utils

import (
	"fmt"

	"golang.org/x/exp/constraints"
)

// GetBy returns the first element of slice for which getter(element) == value,
// or an error if no element matches
func GetBy[T any, V constraints.Ordered](slice []T, value V, getter func(T) V) (T, error) {
	for _, v := range slice {
		if getter(v) == value {
			return v, nil
		}
	}
	return *new(T), fmt.Errorf("element with value \"%v\" not found", value)
}

// MustGetBy is like GetBy but panics if no element matches
func MustGetBy[T any, V constraints.Ordered](slice []T, value V, getter func(T) V) T {
	for _, v := range slice {
		if getter(v) == value {
			return v
		}
	}
	panic(fmt.Sprintf("element with value \"%v\" not found", value))
}

--------------------------------------------------------------------------------
/internal/utils/get_by_test.go:
--------------------------------------------------------------------------------
package utils_test

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"nebius.ai/slurm-operator/internal/utils"
)

func TestValidateGetBy(t *testing.T) {
	type S struct {
		A int
		B string
	}

	s := []S{{
		A: 10,
		B: "hello",
	}, {
		A: 20,
		B: "bye",
	}}

	t.Run("Test GetBy found", func(t *testing.T) {
		f, err := utils.GetBy(s, 10, func(t S) int {
			return t.A
		})
		assert.Equal(t, "hello", f.B)
		assert.NoError(t, err)

		f, err = utils.GetBy(s, "bye", func(t S) string {
			return t.B
		})
		assert.Equal(t, 20, f.A)
		assert.NoError(t, err)
	})

	t.Run("Test GetBy not found", func(t *testing.T) {
		_, err := utils.GetBy(s, 30, func(t S) int {
			return t.A
		})
		assert.Error(t, err)
	})

	t.Run("Test MustGetBy found", func(t *testing.T) {
		f := utils.MustGetBy(s, 10, func(t S) int {
			return t.A
		})
		assert.Equal(t, "hello", f.B)

		f = utils.MustGetBy(s, "bye", func(t S) string {
			return t.B
		})
		assert.Equal(t, 20, f.A)
	})

	t.Run("Test MustGetBy not found", func(t *testing.T) {
		assert.Panics(t, func() {
			_ = utils.MustGetBy(s, 30, func(t S) int {
				return t.A
			})
		})
	})
}
--------------------------------------------------------------------------------
/internal/utils/oneof.go:
--------------------------------------------------------------------------------
package utils

import (
	"reflect"
)

// ValidateOneOf reports whether exactly one pointer field of the struct is set (non-nil)
func ValidateOneOf(v any) bool {
	val := reflect.ValueOf(v)
	if val.Kind() != reflect.Struct {
		return false
	}

	count := 0

	// Iterate through the struct's fields
	for i := 0; i < val.NumField(); i++ {
		field := val.Field(i)

		// Check if the field is a pointer and is not nil
		if field.Kind() == reflect.Ptr && !field.IsNil() {
			count++
			if count > 1 {
				return false
			}
		}
	}

	return count == 1
}

--------------------------------------------------------------------------------
/internal/utils/oneof_test.go:
--------------------------------------------------------------------------------
package utils_test

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"nebius.ai/slurm-operator/internal/utils"
)

func TestValidateOneOf(t *testing.T) {
	type S struct {
		A *int
		B *string
	}

	var (
		a = 10
		b = "hello"
	)

	t.Run("Test one of specified", func(t *testing.T) {
		s1 := utils.ValidateOneOf(S{A: &a})
		s2 := utils.ValidateOneOf(S{B: &b})
		assert.True(t, s1)
		assert.True(t, s2)
	})

	t.Run("Test multiple specified", func(t *testing.T) {
		s := utils.ValidateOneOf(S{A: &a, B: &b})
		assert.False(t, s)
	})

	t.Run("Test none specified", func(t *testing.T) {
		s := utils.ValidateOneOf(S{})
		assert.False(t, s)
	})
}

--------------------------------------------------------------------------------
/internal/utils/unique.go:
--------------------------------------------------------------------------------
package utils

import (
	"golang.org/x/exp/constraints"
)

// ValidateUniqueEntries checks if there are no duplicate values for a field in a slice of structs.
// Returns true if all entries are unique by the value taken via getter. Otherwise, false.
func ValidateUniqueEntries[T any, V constraints.Ordered](slice []T, getter func(T) V) bool {
	seen := map[V]bool{}

	for _, item := range slice {
		fieldValue := getter(item)
		if seen[fieldValue] {
			return false
		}
		seen[fieldValue] = true
	}

	return true
}

--------------------------------------------------------------------------------
/internal/utils/unique_test.go:
--------------------------------------------------------------------------------
package utils_test

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"nebius.ai/slurm-operator/internal/utils"
)

func TestValidateUniqueEntries(t *testing.T) {
	type S struct {
		ID   int
		Name string
	}
	slice := []S{
		{ID: 1, Name: "Alice"},
		{ID: 2, Name: "Bob"},
		{ID: 3, Name: "Charlie"},
		{ID: 2, Name: "David"},
	}

	t.Run("Test Check for duplicates by the 'ID' field", func(t *testing.T) {
		isUnique := utils.ValidateUniqueEntries(
			slice,
			func(s S) int { return s.ID },
		)
		assert.False(t, isUnique)
	})

	t.Run("Test Check for duplicates by the 'Name' field", func(t *testing.T) {
		isUnique := utils.ValidateUniqueEntries(
			slice,
			func(s S) string { return s.Name },
		)
		assert.True(t, isUnique)
	})
}
--------------------------------------------------------------------------------
/internal/values/slurm_config.go:
--------------------------------------------------------------------------------
package values

import slurmv1 "nebius.ai/slurm-operator/api/v1"

type PartitionConfiguration struct {
	ConfigType string
	RawConfig  []string
}

func buildPartitionConfiguration(partitionConfiguration *slurmv1.PartitionConfiguration) PartitionConfiguration {
	return PartitionConfiguration{
		ConfigType: partitionConfiguration.ConfigType,
		RawConfig:  partitionConfiguration.RawConfig,
	}
}

type HealthCheckConfig struct {
	HealthCheckInterval  int32
	HealthCheckProgram   string
	HealthCheckNodeState []slurmv1.HealthCheckNodeState
}

func buildHealthCheckConfig(healthCheckConfig *slurmv1.HealthCheckConfig) *HealthCheckConfig {
	if healthCheckConfig == nil {
		return nil
	}

	return &HealthCheckConfig{
		HealthCheckInterval:  healthCheckConfig.HealthCheckInterval,
		HealthCheckProgram:   healthCheckConfig.HealthCheckProgram,
		HealthCheckNodeState: healthCheckConfig.HealthCheckNodeState,
	}
}
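// Editor's note (illustrative sketch, not a file in this repository):
// HealthCheckConfig mirrors the HealthCheckInterval, HealthCheckProgram and
// HealthCheckNodeState parameters of slurm.conf. The operator's real slurm.conf
// rendering lives elsewhere in internal/render and may differ; this sketch also
// assumes slurmv1.HealthCheckNodeState is a string-backed enum.
package example

import (
	"fmt"
	"strings"

	"nebius.ai/slurm-operator/internal/values"
)

func healthCheckConfLines(cfg *values.HealthCheckConfig) []string {
	if cfg == nil {
		return nil // health checks not configured
	}
	states := make([]string, 0, len(cfg.HealthCheckNodeState))
	for _, s := range cfg.HealthCheckNodeState {
		states = append(states, string(s))
	}
	return []string{
		fmt.Sprintf("HealthCheckInterval=%d", cfg.HealthCheckInterval),
		fmt.Sprintf("HealthCheckProgram=%s", cfg.HealthCheckProgram),
		fmt.Sprintf("HealthCheckNodeState=%s", strings.Join(states, ",")),
	}
}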
--------------------------------------------------------------------------------
/internal/values/slurm_jail.go:
--------------------------------------------------------------------------------
package values

import (
	"k8s.io/utils/ptr"

	slurmv1 "nebius.ai/slurm-operator/api/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
)

type PopulateJail struct {
	slurmv1.PopulateJail

	Name string

	ContainerPopulateJail Container

	VolumeJail slurmv1.NodeVolume

	Overwrite bool

	Maintenance *consts.MaintenanceMode
}

func buildSlurmPopulateJailFrom(clusterName string, maintenance *consts.MaintenanceMode, populateJail *slurmv1.PopulateJail) PopulateJail {
	return PopulateJail{
		PopulateJail: *populateJail.DeepCopy(),
		Name:         naming.BuildPopulateJailJobName(clusterName),
		ContainerPopulateJail: Container{
			Name: consts.ContainerNamePopulateJail,
			NodeContainer: slurmv1.NodeContainer{
				Image:           populateJail.Image,
				ImagePullPolicy: populateJail.ImagePullPolicy,
				AppArmorProfile: populateJail.AppArmorProfile,
			},
		},
		VolumeJail: slurmv1.NodeVolume{
			VolumeSourceName: ptr.To(consts.VolumeNameJail),
		},
		Overwrite:   populateJail.Overwrite,
		Maintenance: maintenance,
	}
}

--------------------------------------------------------------------------------
/internal/values/slurm_periodic_checks.go:
--------------------------------------------------------------------------------
package values

import (
	"k8s.io/utils/ptr"

	slurmv1 "nebius.ai/slurm-operator/api/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
)

type SlurmNCCLBenchmark struct {
	slurmv1.NCCLBenchmark

	Name string

	ContainerNCCLBenchmark Container

	VolumeJail slurmv1.NodeVolume
}

func buildSlurmNCCLBenchmarkFrom(clusterName string, ncclBenchmark *slurmv1.NCCLBenchmark) SlurmNCCLBenchmark {
	return SlurmNCCLBenchmark{
		NCCLBenchmark: *ncclBenchmark.DeepCopy(),
		Name:          naming.BuildCronJobNCCLBenchmarkName(clusterName),
		ContainerNCCLBenchmark: Container{
			Name: consts.ContainerNameNCCLBenchmark,
			NodeContainer: slurmv1.NodeContainer{
				Image:           ncclBenchmark.Image,
				ImagePullPolicy: ncclBenchmark.ImagePullPolicy,
				AppArmorProfile: ncclBenchmark.AppArmorProfile,
			},
		},
		VolumeJail: slurmv1.NodeVolume{
			VolumeSourceName: ptr.To(consts.VolumeNameJail),
		},
	}
}

--------------------------------------------------------------------------------
/internal/values/slurm_rest.go:
--------------------------------------------------------------------------------
package values

import (
	corev1 "k8s.io/api/core/v1"
	slurmv1 "nebius.ai/slurm-operator/api/v1"
	"nebius.ai/slurm-operator/internal/consts"
	"nebius.ai/slurm-operator/internal/naming"
)

// SlurmREST contains the data needed to deploy and reconcile the Slurm REST API
type SlurmREST struct {
	slurmv1.SlurmNode

	Enabled              bool
	ContainerREST        Container
	CustomInitContainers []corev1.Container
	Service              Service
	Maintenance          *consts.MaintenanceMode
}

func buildRestFrom(clusterName string, maintenance *consts.MaintenanceMode, rest *slurmv1.SlurmRest) SlurmREST {
	containerREST := buildContainerFrom(
		rest.SlurmRestNode,
		consts.ContainerNameREST,
	)
	if containerREST.Port == 0 {
		containerREST.Port = consts.DefaultRESTPort
	}

	return SlurmREST{
		SlurmNode:            *rest.SlurmNode.DeepCopy(),
		Enabled:              rest.Enabled,
		ContainerREST:        containerREST,
		CustomInitContainers: rest.CustomInitContainers,
		Service:              buildServiceFrom(naming.BuildServiceName(consts.ComponentTypeREST, clusterName)),
		Maintenance:          maintenance,
	}
}
--------------------------------------------------------------------------------
/internal/values/slurm_sconfigcontroller.go:
--------------------------------------------------------------------------------
package values

import (
	"k8s.io/utils/ptr"

	slurmv1 "nebius.ai/slurm-operator/api/v1"
	"nebius.ai/slurm-operator/internal/consts"
)

// SConfigController contains the data needed to deploy and reconcile the sconfigcontroller
type SConfigController struct {
	slurmv1.SlurmNode

	Container  Container
	VolumeJail slurmv1.NodeVolume

	Maintenance consts.MaintenanceMode
}

func buildSConfigControllerFrom(
	node slurmv1.SlurmNode,
	container slurmv1.NodeContainer,
	maintenance consts.MaintenanceMode,
) SConfigController {
	containerSConfigController := buildContainerFrom(
		container,
		consts.ContainerNameSConfigController,
	)
	res := SConfigController{
		SlurmNode: node,
		Container: containerSConfigController,
		VolumeJail: slurmv1.NodeVolume{
			VolumeSourceName: ptr.To(consts.VolumeNameJail),
		},
		Maintenance: maintenance,
	}

	return res
}

--------------------------------------------------------------------------------
/internal/values/slurm_worker_test.go:
--------------------------------------------------------------------------------
package values

import (
	"reflect"
	"testing"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/utils/ptr"
	slurmv1 "nebius.ai/slurm-operator/api/v1"
	"nebius.ai/slurm-operator/internal/consts"
)

func TestBuildSlurmWorkerFrom(t *testing.T) {
	clusterName := "test-cluster"

	sharedMemorySizeValue := resource.NewQuantity(1, resource.DecimalSI)

	worker := &slurmv1.SlurmNodeWorker{
		Volumes: slurmv1.SlurmNodeWorkerVolumes{
			SharedMemorySize: sharedMemorySizeValue,
		},
	}
	ncclSettings := &slurmv1.NCCLSettings{}

	result := buildSlurmWorkerFrom(clusterName, ptr.To(consts.ModeNone), worker, ncclSettings, false)

	if !reflect.DeepEqual(result.SlurmNode, worker.SlurmNode) {
		t.Errorf("Expected SlurmNode to be %v, but got %v", *worker.SlurmNode.DeepCopy(), result.SlurmNode)
	}
	if result.NCCLSettings != *ncclSettings.DeepCopy() {
		t.Errorf("Expected NCCLSettings to be %v, but got %v", *ncclSettings.DeepCopy(), result.NCCLSettings)
	}
	if result.SharedMemorySize != sharedMemorySizeValue {
		t.Errorf("Expected SharedMemorySize to be %v, but got %v", sharedMemorySizeValue, result.SharedMemorySize)
	}
}

--------------------------------------------------------------------------------
/pkg/jwt/jwt.go:
--------------------------------------------------------------------------------
package jwt

import (
	"nebius.ai/slurm-operator/internal/jwt"
)

type (
	Token         = jwt.Token
	TokenMeta     = jwt.TokenMeta
	TokenRegistry = jwt.TokenRegistry
)

var (
	NewToken         = jwt.NewToken
	NewTokenRegistry = jwt.NewTokenRegistry

	GenerateSigningKey = jwt.GenerateSigningKey
)

var (
	DefaultTokenLifetime = jwt.DefaultTokenLifetime
	DefaultTokenEviction = jwt.DefaultTokenEviction
)

const (
	DefaultMaxCacheEntries = jwt.DefaultMaxCacheEntries
)

--------------------------------------------------------------------------------