├── pages ├── robots.txt └── charts │ ├── ais-operator-1.4.1.tgz │ ├── ais-operator-1.5.0.tgz │ ├── ais-operator-1.6.0.tgz │ ├── ais-operator-1.6.1.tgz │ ├── ais-operator-1.7.0.tgz │ ├── ais-operator-2.0.0.tgz │ ├── ais-operator-2.0.1.tgz │ ├── ais-operator-2.1.0.tgz │ ├── ais-operator-2.1.1.tgz │ ├── ais-operator-2.1.2.tgz │ ├── ais-operator-2.10.0.tgz │ ├── ais-operator-2.11.0.tgz │ ├── ais-operator-2.2.0.tgz │ ├── ais-operator-2.3.0.tgz │ ├── ais-operator-2.4.0.tgz │ ├── ais-operator-2.5.0.tgz │ ├── ais-operator-2.6.0.tgz │ ├── ais-operator-2.7.0.tgz │ ├── ais-operator-2.8.0.tgz │ ├── ais-operator-2.9.0.tgz │ ├── ais-operator-2.9.1.tgz │ ├── ais-operator-2.9.2.tgz │ ├── ais-operator-2.9.3.tgz │ └── artifacthub-repo.yaml ├── helm ├── .gitignore ├── authn │ ├── config │ │ └── authn │ │ │ ├── cert │ │ │ ├── tls.yaml │ │ │ ├── default.yaml │ │ │ ├── sjc4-dev.yaml │ │ │ ├── neb-fin.yaml │ │ │ ├── oci-iad.yaml │ │ │ ├── oci-kratos.yaml │ │ │ └── oci-ord.yaml │ │ │ ├── default.yaml.gotmpl │ │ │ ├── tls.yaml.gotmpl │ │ │ ├── nvidia.yaml.gotmpl │ │ │ └── oci.yaml.gotmpl │ ├── charts │ │ └── authn │ │ │ ├── Chart.yaml │ │ │ ├── values.yaml.gotmpl │ │ │ └── templates │ │ │ └── cert.yaml │ └── helmfile.yaml ├── operator │ ├── config │ │ ├── operator │ │ │ ├── default.yaml.gotmpl │ │ │ └── keycloak.yaml.gotmpl │ │ └── tls-cert │ │ │ ├── sjc4-1000.yaml │ │ │ ├── sjc11.yaml │ │ │ └── sjc4-dev.yaml │ ├── tls-cert │ │ ├── Chart.yaml │ │ └── templates │ │ │ └── cert.yaml │ └── check_cert_manager.sh ├── ais │ ├── charts │ │ ├── create-pv │ │ │ └── Chart.yaml │ │ ├── tls-cert │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── cert.yaml │ │ ├── cloud-secrets │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── gcp-secret.yaml │ │ │ │ ├── aws-secret.yaml │ │ │ │ └── oci-secret.yaml │ │ └── ais-cluster │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ └── proxy-lb.yaml │ │ │ └── values.yaml │ ├── config │ │ ├── tls-cert │ │ │ ├── values-sample.yaml │ │ │ ├── sjc11.yaml │ │ │ ├── sjc112.yaml │ │ │ ├── sjc4-1000.yaml │ │ │ ├── sjc4-dev.yaml │ │ │ ├── keycloak.yaml │ │ │ └── neb-fin.yaml │ │ ├── cloud │ │ │ └── sjc11.yaml.gotmpl │ │ └── ais │ │ │ ├── sjc4-1000.yaml │ │ │ ├── neb-fin-test.yaml │ │ │ ├── sjc112.yaml │ │ │ ├── oci-iad-test.yaml │ │ │ ├── neb-fin.yaml │ │ │ └── sjc4-dev.yaml │ └── scripts │ │ ├── label-nodes.sh │ │ ├── delete-pvc.sh │ │ └── delete-released-pv.sh ├── cluster-issuer │ ├── issuer-chart │ │ ├── Chart.yaml │ │ └── templates │ │ │ └── ca.yaml │ ├── helmfile.yaml │ └── config │ │ ├── sjc11.yaml │ │ ├── default.yaml │ │ └── values-sample.yaml └── ais-client │ ├── helmfile.yaml │ ├── templates │ ├── serviceaccount.yaml │ ├── rbac.yaml │ └── deployment.yaml │ ├── Chart.yaml │ ├── trust-bundle.yaml │ └── values.yaml ├── auth └── keycloak │ ├── scripts │ ├── .gitignore │ ├── README.md │ ├── requirements.txt │ └── prepare_cluster.sh │ ├── delete-cluster.sh │ ├── docker │ ├── .gitignore │ ├── recreate-volumes.sh │ ├── sample.env │ ├── openssl-san.cnf │ └── docker-keycloak.sh │ ├── cnpg │ └── helm │ │ ├── cluster │ │ ├── values.yaml │ │ └── helmfile.yaml │ │ └── operator │ │ └── helmfile.yaml │ ├── manifests │ ├── admin-secret.template.yaml │ ├── trust-bundle.yaml │ ├── certificate.yaml │ └── keycloak.yaml │ ├── kind │ └── config.yaml │ └── prereq-helmfile.yaml ├── monitoring ├── alloy │ ├── environments │ │ ├── prod │ │ │ ├── alloy-values.yaml │ │ │ └── values.yaml.gotmpl │ │ ├── remote │ │ │ ├── alloy-values.yaml │ │ │ └── values.yaml.gotmpl │ │ └── local │ │ │ ├── 
alloy-values.yaml │ │ │ └── values.yaml.gotmpl │ ├── config-chart │ │ ├── Chart.yaml │ │ ├── environments │ │ │ ├── remote │ │ │ │ ├── logs.alloy.gotmpl │ │ │ │ └── metrics.alloy.gotmpl │ │ │ ├── local │ │ │ │ ├── logs.alloy.gotmpl │ │ │ │ └── metrics.alloy.gotmpl │ │ │ └── prod │ │ │ │ ├── logs.alloy.gotmpl │ │ │ │ └── metrics.alloy.gotmpl │ │ ├── templates │ │ │ └── configmap.yaml │ │ └── common │ │ │ ├── config.alloy.gotmpl │ │ │ └── common.alloy.gotmpl │ └── helmfile.yaml.gotmpl ├── kube-state-metrics │ ├── environments │ │ ├── dev │ │ │ └── values.yaml │ │ └── prod │ │ │ └── values.yaml │ ├── values.yaml.gotmpl │ ├── helmfile.yaml.gotmpl │ └── README.md ├── images │ ├── grafana.png │ └── prometheus.png ├── kube-prom │ ├── dashboard-configmap │ │ ├── Chart.yaml │ │ └── templates │ │ │ └── configmap.yaml │ ├── values │ │ ├── node-exporter.yaml │ │ └── kube-state-metrics.yaml.gotmpl │ ├── environments │ │ ├── prod │ │ │ └── values.yaml.gotmpl │ │ └── dev │ │ │ └── values.yaml.gotmpl │ └── helmfile.yaml.gotmpl ├── loki │ ├── environments │ │ └── prod │ │ │ └── values.yaml │ ├── README.md │ └── helmfile.yaml.gotmpl ├── vault │ ├── README.md │ └── update_secret.sh └── promtail │ ├── values.yaml │ └── helmfile.yaml ├── playbooks ├── cloud │ ├── roles │ │ ├── gcp_config │ │ │ ├── files │ │ │ │ └── .gitignore │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── aws_config │ │ │ ├── files │ │ │ │ └── .gitignore │ │ │ └── tasks │ │ │ │ └── main.yml │ │ └── oci_config │ │ │ ├── files │ │ │ └── .gitignore │ │ │ ├── templates │ │ │ └── config.j2 │ │ │ └── tasks │ │ │ └── main.yml │ ├── vars │ │ ├── aws_config.yml │ │ ├── gcp_config.yml │ │ └── oci_config.yml │ ├── ais_oci_config.yml │ ├── ais_aws_config.yml │ └── ais_gcp_config.yml ├── host-config │ ├── roles │ │ ├── config_kubelet │ │ │ ├── vars │ │ │ │ └── main.yml │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── config_kubelet_systemd │ │ │ ├── files │ │ │ │ └── kubelet-extra-args.conf │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── stat_tools │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── ais_host_config_common │ │ │ ├── files │ │ │ │ └── aishostconfig.service │ │ │ └── templates │ │ │ │ └── 01-netcfg.yaml.j2 │ │ ├── ais_taint_nodes │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── pcm │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── check_disk_info │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_enable_multiqueue │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_ntp │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ └── ais_gpuhost_device_plugin │ │ │ └── tasks │ │ │ └── main.yml │ ├── vars │ │ ├── ntp.yml │ │ ├── kubelet.yml │ │ └── ais_datafs.yml │ ├── ais_gpuhost_config.yml │ ├── ais_datafs_umount.yml │ ├── ais_datafs_umount_purge.yml │ ├── ais_datafs_mount.yml │ ├── ais_ntp.yml │ ├── ais_host_config_pcm.yml │ ├── ais_enable_multiqueue.yml │ ├── ais_host_config_common.yml │ ├── ais_host_config_sysctl.yml │ └── docs │ │ ├── ais_enable_multiqueue.md │ │ └── config_kubelet.md ├── ais-deployment │ ├── roles │ │ ├── ais_deploy_operator │ │ │ ├── defaults │ │ │ │ └── main.yml │ │ │ ├── templates │ │ │ │ └── deploy-operator.sh.j2 │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_decommission_cluster │ │ │ └── defaults │ │ │ │ └── main.yml │ │ ├── install_controller_requirements │ │ │ ├── files │ │ │ │ └── requirements.txt │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── install_ansible_collections │ │ │ ├── files │ │ │ │ └── collections.yml │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── generate_https_cert │ │ │ ├── defaults │ │ │ │ └── main.yml │ │ │ ├── templates │ │ │ │ ├── 
cert.yaml.j2 │ │ │ │ └── ca.yaml.j2 │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── install_rancher_lpp │ │ │ ├── defaults │ │ │ │ └── main.yaml │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── install_multus │ │ │ └── defaults │ │ │ │ └── main.yaml │ │ ├── ais_delete_conf_host_path │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ ├── ais_undeploy_operator │ │ │ ├── files │ │ │ │ └── undeploy-operator.sh │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_cleanup_markers │ │ │ ├── tasks │ │ │ │ └── main.yml │ │ │ └── templates │ │ │ │ └── clear-markers.sh.j2 │ │ ├── fetch_ca_cert │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── create_network_definition │ │ │ └── files │ │ │ │ ├── create-network-definition.sh │ │ │ │ └── nad.template.yaml │ │ ├── create_namespace │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_cleanup_all │ │ │ ├── tasks │ │ │ │ └── main.yml │ │ │ └── files │ │ │ │ └── clean-mpaths.sh │ │ ├── ais_deploy_cluster │ │ │ └── templates │ │ │ │ └── sysctls.json.j2 │ │ ├── ais_delete_cluster │ │ │ ├── templates │ │ │ │ └── delete_cluster.sh.j2 │ │ │ └── tasks │ │ │ │ └── main.yml │ │ ├── ais_delete_conf_state_storage │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ └── create_pv │ │ │ └── files │ │ │ └── pv.template.yaml │ ├── ais_deploy_operator.yml │ ├── ais_undeploy_operator.yml │ ├── ais_cleanup_markers.yml │ ├── install_requirements.yml │ ├── vars │ │ ├── multihome.yml │ │ └── ais_mpaths.yml │ ├── ais_downscale_cluster.yml │ ├── fetch_ca_cert.yml │ ├── create_network_definition.yml │ ├── ais_cleanup_all.yml │ ├── ais_shutdown_cluster.yml │ ├── generate_https_cert.yml │ ├── ais_deploy_cluster.yml │ ├── ais_decommission_cluster.yml │ └── docs │ │ └── generate_https_cert.md ├── security │ ├── inventory.ini │ ├── os_hardening.yaml │ └── roles │ │ ├── rsyslog │ │ └── tasks │ │ │ └── main.yml │ │ ├── journald │ │ └── tasks │ │ │ └── main.yml │ │ ├── tmp_dir │ │ └── tasks │ │ │ └── main.yml │ │ ├── sudo │ │ └── tasks │ │ │ └── main.yml │ │ ├── kernel │ │ └── tasks │ │ │ └── main.yml │ │ └── crypto_policy │ │ └── tasks │ │ └── main.yml ├── extra │ ├── oci │ │ ├── configure_networks.yml │ │ ├── growfs.yml │ │ ├── hosts-example.ini │ │ ├── roles │ │ │ └── growfs │ │ │ │ └── tasks │ │ │ │ └── main.yaml │ │ └── README.md │ └── manual │ │ └── vars │ │ └── cluster_config.yaml └── hosts-example.ini ├── tools └── state-manager │ ├── requirements.txt │ ├── .gitignore │ ├── pod_config.py │ └── ais_metadata.py ├── ais-operator-helper ├── src │ └── go.mod ├── Makefile ├── Dockerfile └── README.md ├── docs ├── diagrams │ ├── ais-overview.png │ └── certificates.jpg └── samples │ └── sample-pv.yaml ├── operator ├── config │ ├── base │ │ ├── manifests │ │ │ ├── kustomization.yaml │ │ │ └── bases │ │ │ │ └── ais-operator.clusterserviceversion.yaml │ │ ├── crd │ │ │ ├── kustomization.yaml │ │ │ └── kustomizeconfig.yaml │ │ ├── webhook │ │ │ ├── kustomization.yaml │ │ │ ├── service.yaml │ │ │ ├── kustomizeconfig.yaml │ │ │ └── manifests.yaml │ │ ├── certmanager │ │ │ ├── kustomization.yaml │ │ │ ├── kustomizeconfig.yaml │ │ │ ├── issuer.yaml │ │ │ ├── certificate_webhook.yaml │ │ │ └── certificate_metrics.yaml │ │ ├── rbac │ │ │ ├── auth_proxy_client_clusterrole.yaml │ │ │ ├── role_binding.yaml │ │ │ ├── auth_proxy_role_binding.yaml │ │ │ ├── leader_election_role_binding.yaml │ │ │ ├── auth_proxy_role.yaml │ │ │ ├── kustomization.yaml │ │ │ ├── aistore_viewer_role.yaml │ │ │ ├── aistore_editor_role.yaml │ │ │ ├── leader_election_role.yaml │ │ │ └── role.yaml │ │ ├── metallb │ │ │ └── configmap.template.yaml │ │ ├── manager │ │ 
│ ├── controller_manager_config.yaml │ │ │ └── kustomization.yaml │ │ ├── metrics_service.yaml │ │ └── kustomization.yaml │ ├── scorecard │ │ ├── bases │ │ │ └── config.yaml │ │ ├── patches │ │ │ ├── basic.config.yaml │ │ │ └── olm.config.yaml │ │ └── kustomization.yaml │ ├── samples │ │ └── kustomization.yaml │ └── overlays │ │ ├── prometheus │ │ ├── kustomization.yaml │ │ ├── monitor.yaml │ │ └── monitor_tls_patch.yaml │ │ └── default │ │ ├── aistores_cainjection_patch.yaml │ │ ├── aistores_keep_policy_patch.yaml │ │ ├── manager_env_patch.yaml │ │ ├── webhook │ │ ├── aistores_conversion_webhook_patch.yaml │ │ └── webhook_cainjection_patch.yaml │ │ ├── manager_config_patch.yaml │ │ ├── manager_ca_configmap_patch.yaml │ │ ├── manager_authn_ca_configmap_patch.yaml │ │ ├── manager_webhook_patch.yaml │ │ └── manager_auth_metric_patch.yaml ├── hack │ └── boilerplate.go.txt ├── .dockerignore ├── pkg │ ├── resources │ │ ├── cmn │ │ │ ├── cmn_suite_test.go │ │ │ └── services.go │ │ ├── target │ │ │ └── target_suite_test.go │ │ └── statsd │ │ │ └── configmap.go │ ├── services │ │ └── services_suite_test.go │ ├── controllers │ │ └── events.go │ └── client │ │ └── client_suite_test.go ├── scripts │ ├── rbac.yaml │ ├── kind_cluster_local.yaml │ ├── lint.sh │ ├── install_helm.sh │ ├── deploy.sh │ ├── cloud-provider-kind.sh │ ├── test.sh │ ├── test_in_cluster.sh │ ├── test_pod.yaml │ └── go_install_tool.sh ├── tests │ ├── test.dockerfile │ ├── tutils │ │ └── clientset.go │ └── ci │ │ ├── test_in_cluster.sh │ │ └── kind_cluster_ci.yaml ├── .gitignore ├── PROJECT └── api │ └── v1beta1 │ ├── groupversion_info.go │ └── util_types.go ├── sonar-project.properties ├── .gitignore ├── log-sidecar ├── Makefile ├── README.md └── Dockerfile ├── manifests ├── debug │ └── aisnode_debug.yaml └── cloud │ ├── oci-authn-lb.yaml │ └── oci-proxy-lb.yaml ├── .github ├── ISSUE_TEMPLATE │ └── documentation-issue.yml ├── workflows │ ├── docker_ais_logs.yml │ ├── docker_ais_operator_helper.yml │ ├── docker_operator.yml │ ├── publish-pages.yml │ └── publish-release.yml └── dependabot.yml ├── SECURITY.md └── LICENSE /pages/robots.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helm/.gitignore: -------------------------------------------------------------------------------- 1 | *aisvalues.yaml -------------------------------------------------------------------------------- /helm/authn/config/authn/cert/tls.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helm/authn/config/authn/cert/default.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helm/authn/config/authn/default.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /auth/keycloak/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | ca.crt 2 | venv -------------------------------------------------------------------------------- /helm/operator/config/operator/default.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /monitoring/alloy/environments/prod/alloy-values.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /monitoring/alloy/environments/remote/alloy-values.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /monitoring/kube-state-metrics/environments/dev/values.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playbooks/cloud/roles/gcp_config/files/.gitignore: -------------------------------------------------------------------------------- 1 | gcp.json -------------------------------------------------------------------------------- /tools/state-manager/requirements.txt: -------------------------------------------------------------------------------- 1 | kubernetes==32.0.1 -------------------------------------------------------------------------------- /auth/keycloak/delete-cluster.sh: -------------------------------------------------------------------------------- 1 | kind delete cluster --name=keycloak-test -------------------------------------------------------------------------------- /helm/authn/config/authn/cert/sjc4-dev.yaml: -------------------------------------------------------------------------------- 1 | tls: 2 | createCert: true -------------------------------------------------------------------------------- /playbooks/cloud/roles/aws_config/files/.gitignore: -------------------------------------------------------------------------------- 1 | config 2 | credentials -------------------------------------------------------------------------------- /playbooks/cloud/vars/aws_config.yml: -------------------------------------------------------------------------------- 1 | target_dir: .aws 2 | secret_name: aws-creds -------------------------------------------------------------------------------- /tools/state-manager/.gitignore: -------------------------------------------------------------------------------- 1 | backups 2 | restore 3 | venv 4 | __pycache__ -------------------------------------------------------------------------------- /playbooks/cloud/vars/gcp_config.yml: -------------------------------------------------------------------------------- 1 | target_dir: /var/gcp 2 | secret_name: gcp-creds -------------------------------------------------------------------------------- /playbooks/host-config/roles/config_kubelet/vars/main.yml: -------------------------------------------------------------------------------- 1 | unsafe_sysctls: 2 | - "net.*" -------------------------------------------------------------------------------- /helm/ais/charts/create-pv/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-create-pv 3 | version: 0.4.0 -------------------------------------------------------------------------------- /helm/ais/charts/tls-cert/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-tls-cert 3 | version: 0.1.0 -------------------------------------------------------------------------------- /helm/operator/tls-cert/Chart.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: operator-tls-cert 3 | version: 0.1.0 -------------------------------------------------------------------------------- /monitoring/alloy/environments/local/alloy-values.yaml: -------------------------------------------------------------------------------- 1 | alloy: 2 | stabilityLevel: "experimental" -------------------------------------------------------------------------------- /helm/ais/charts/cloud-secrets/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-cloud-secrets 3 | version: 0.1.0 -------------------------------------------------------------------------------- /helm/cluster-issuer/issuer-chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: cluster-issuer 3 | version: 0.1.0 -------------------------------------------------------------------------------- /monitoring/images/grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/monitoring/images/grafana.png -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_deploy_operator/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | operator_version: v2.8.0 3 | -------------------------------------------------------------------------------- /ais-operator-helper/src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-k8s/images/cleanup-helper/src 2 | 3 | go 1.25.0 4 | -------------------------------------------------------------------------------- /docs/diagrams/ais-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/docs/diagrams/ais-overview.png -------------------------------------------------------------------------------- /docs/diagrams/certificates.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/docs/diagrams/certificates.jpg -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_decommission_cluster/defaults/main.yml: -------------------------------------------------------------------------------- 1 | storage_class_name : ais-local-storage -------------------------------------------------------------------------------- /monitoring/images/prometheus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/monitoring/images/prometheus.png -------------------------------------------------------------------------------- /operator/config/base/manifests/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - bases/ais-operator.clusterserviceversion.yaml 3 | -------------------------------------------------------------------------------- /operator/hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
3 | */ -------------------------------------------------------------------------------- /pages/charts/ais-operator-1.4.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-1.4.1.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-1.5.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-1.5.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-1.6.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-1.6.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-1.6.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-1.6.1.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-1.7.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-1.7.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.0.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.0.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.0.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.0.1.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.1.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.1.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.1.1.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.1.1.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.1.2.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.1.2.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.10.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.10.0.tgz -------------------------------------------------------------------------------- /pages/charts/ais-operator-2.11.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.11.0.tgz 
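The .tgz archives above are packaged releases of the operator Helm chart served from this repository's GitHub Pages site. A minimal usage sketch — the repository URL below is an assumption (wherever pages/charts is actually published), not confirmed by this listing:

    # Hypothetical Helm repo URL; substitute the real Pages endpoint
    helm repo add ais-k8s https://nvidia.github.io/ais-k8s/charts
    helm repo update
    helm install ais-operator ais-k8s/ais-operator --version 2.11.0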
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.2.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.2.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.3.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.3.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.4.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.4.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.5.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.5.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.6.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.6.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.7.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.7.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.8.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.8.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.9.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.9.0.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.9.1.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.9.1.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.9.2.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.9.2.tgz
--------------------------------------------------------------------------------
/pages/charts/ais-operator-2.9.3.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/pages/charts/ais-operator-2.9.3.tgz
--------------------------------------------------------------------------------
/playbooks/security/inventory.ini:
--------------------------------------------------------------------------------
 1 | [all]
 2 | node1 ansible_host=10.0.0.1
 3 | node2 ansible_host=10.0.0.2
 4 | # and so on
--------------------------------------------------------------------------------
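A quick sketch of how this sample inventory is consumed — running the OS-hardening playbook that lives in the same directory (os_hardening.yaml appears later in this listing); assumes SSH access and sudo privileges on the listed hosts:

    cd playbooks/security
    ansible-playbook -i inventory.ini os_hardening.yaml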
/helm/authn/config/authn/cert/neb-fin.yaml: -------------------------------------------------------------------------------- 1 | tls: 2 | createCert: true 3 | certificate: 4 | ipAddresses: 5 | dnsNames: -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_controller_requirements/files/requirements.txt: -------------------------------------------------------------------------------- 1 | kubernetes>=12.0.0 2 | jsonpatch>=1.33.0 -------------------------------------------------------------------------------- /helm/ais-client/helmfile.yaml: -------------------------------------------------------------------------------- 1 | releases: 2 | - name: ais-client 3 | namespace: ais 4 | createNamespace: true 5 | chart: ./ -------------------------------------------------------------------------------- /auth/keycloak/docker/.gitignore: -------------------------------------------------------------------------------- 1 | *.pem 2 | *.csr 3 | # Allow our sample.env in this dir 4 | !.env 5 | # Ignore keycloak volumes 6 | db 7 | exports -------------------------------------------------------------------------------- /helm/authn/charts/authn/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: authn 3 | description: AIS Authentication Server 4 | version: 0.1.0 5 | appVersion: v3.31 -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: alloy-config 3 | description: ConfigMap for Alloy configuration 4 | version: 0.1.0 -------------------------------------------------------------------------------- /operator/config/base/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ais.nvidia.com_aistores.yaml 3 | 4 | configurations: 5 | - kustomizeconfig.yaml 6 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_deploy_operator.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "controller" 3 | gather_facts: false 4 | roles: 5 | - ais_deploy_operator 6 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_undeploy_operator.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "controller" 3 | gather_facts: no 4 | roles: 5 | - ais_undeploy_operator 6 | -------------------------------------------------------------------------------- /playbooks/cloud/roles/oci_config/files/.gitignore: -------------------------------------------------------------------------------- 1 | # Add your OCI credential files here: 2 | # oci_api_key - Your OCI API private key file 3 | oci_api_key -------------------------------------------------------------------------------- /helm/authn/config/authn/cert/oci-iad.yaml: -------------------------------------------------------------------------------- 1 | tls: 2 | certificate: 3 | ipAddresses: 4 | dnsNames: 5 | - "authn.asr.iad.oci.aistore.nvidia.com" -------------------------------------------------------------------------------- /operator/.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: 
https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 3 | bin/ 4 | -------------------------------------------------------------------------------- /operator/config/base/webhook/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manifests.yaml 3 | - service.yaml 4 | 5 | configurations: 6 | - kustomizeconfig.yaml 7 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_ansible_collections/files/collections.yml: -------------------------------------------------------------------------------- 1 | collections: 2 | - name: community.general 3 | - name: kubernetes.core -------------------------------------------------------------------------------- /helm/authn/config/authn/cert/oci-kratos.yaml: -------------------------------------------------------------------------------- 1 | tls: 2 | certificate: 3 | ipAddresses: 4 | dnsNames: 5 | - "authn.riva.ord.oci.aistore.nvidia.com" -------------------------------------------------------------------------------- /helm/authn/config/authn/tls.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | tls: 2 | enabled: true 3 | createCert: false 4 | certPath: "/var/certs/tls.crt" 5 | keyPath: "/var/certs/tls.key" -------------------------------------------------------------------------------- /auth/keycloak/cnpg/helm/cluster/values.yaml: -------------------------------------------------------------------------------- 1 | type: postgresql 2 | mode: standalone 3 | cluster: 4 | instances: 3 5 | storage: 6 | storageClass: openebs-hostpath -------------------------------------------------------------------------------- /playbooks/host-config/roles/config_kubelet_systemd/files/kubelet-extra-args.conf: -------------------------------------------------------------------------------- 1 | [Service] 2 | Environment="KUBELET_EXTRA_ARGS=--allowed-unsafe-sysctls='net.*'" 3 | -------------------------------------------------------------------------------- /monitoring/kube-prom/dashboard-configmap/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-grafana-dashboard 3 | description: ConfigMap for AIS Grafana Dashboards 4 | version: 0.1.0 -------------------------------------------------------------------------------- /playbooks/host-config/vars/ntp.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Local ntp servers to use - applied only if running the ais_ntp.yml playbook 3 | # 4 | ntp_pools: 5 | - pool-ntp-3.nvidiangn.net -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/generate_https_cert/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # cluster domain name for DNS (used for generating TLS certificates) 3 | cluster_domain: "cluster.local" -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_rancher_lpp/defaults/main.yaml: -------------------------------------------------------------------------------- 1 | lpp_url: https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.31/deploy/local-path-storage.yaml -------------------------------------------------------------------------------- 
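The install_rancher_lpp role (its task appears shortly after in this listing) simply kubectl-applies the manifest at lpp_url. The equivalent one-off command, assuming kubectl already targets the intended cluster, is:

    kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.31/deploy/local-path-storage.yaml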
/helm/authn/config/authn/cert/oci-ord.yaml: -------------------------------------------------------------------------------- 1 | tls: 2 | certificate: 3 | ipAddresses: 4 | dnsNames: 5 | - "ais-authn.ais" 6 | - "authn.asr.ord.oci.aistore.nvidia.com" -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_multus/defaults/main.yaml: -------------------------------------------------------------------------------- 1 | multus_url: https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset-thick.yml -------------------------------------------------------------------------------- /playbooks/extra/oci/configure_networks.yml: -------------------------------------------------------------------------------- 1 | - name: Configure VNICs on OCI hosts 2 | hosts: oci_hosts 3 | gather_facts: false 4 | become: true 5 | roles: 6 | - configure_networks -------------------------------------------------------------------------------- /operator/config/base/certmanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - certificate_metrics.yaml 3 | - certificate_webhook.yaml 4 | - issuer.yaml 5 | 6 | configurations: 7 | - kustomizeconfig.yaml 8 | -------------------------------------------------------------------------------- /helm/ais/charts/ais-cluster/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-cluster 3 | description: A Helm chart for deploying AIS on Kubernetes 4 | appVersion: "4.0" 5 | type: application 6 | version: 0.4.0 -------------------------------------------------------------------------------- /helm/ais-client/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create }} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ .Values.serviceAccount.name }} 6 | {{- end }} 7 | -------------------------------------------------------------------------------- /operator/config/scorecard/bases/config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scorecard.operatorframework.io/v1alpha3 2 | kind: Configuration 3 | metadata: 4 | name: config 5 | stages: 6 | - parallel: true 7 | tests: [] 8 | -------------------------------------------------------------------------------- /helm/ais-client/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: ais-client 3 | description: Deploy an AIS client pod for executing commands from within a cluster 4 | type: application 5 | version: 0.1.0 6 | appVersion: "1.0" -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/remote/logs.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | loki.source.kubernetes "pod_logs" { 2 | targets = discovery.relabel.pod_logs.output 3 | forward_to = [loki.relabel.remote.receiver] 4 | } -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_rancher_lpp/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | - name: Install Rancher's local path provisioner (https://github.com/rancher/local-path-provisioner) 2 | shell: kubectl apply -f "{{ 
lpp_url }}" -------------------------------------------------------------------------------- /operator/config/samples/kustomization.yaml: -------------------------------------------------------------------------------- 1 | ## Append samples you want in your CSV to this file as resources ## 2 | resources: 3 | - ais_v1beta1_aistore.yaml 4 | # +kubebuilder:scaffold:manifestskustomizesamples 5 | -------------------------------------------------------------------------------- /sonar-project.properties: -------------------------------------------------------------------------------- 1 | sonar.projectKey=GPUSW_AIStore_AIStore_AIS-K8S 2 | sonar.qualitygate.wait=true 3 | sonar.inclusions=\ 4 | operator/cmd/**/*.go,\ 5 | operator/api/**/*.go,\ 6 | operator/pkg/**/*.go -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/local/logs.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | loki.source.kubernetes "pod_logs" { 2 | targets = discovery.relabel.pod_logs.output 3 | forward_to = [loki.process.ais_extract_level.receiver] 4 | } -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/prod/logs.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | loki.source.kubernetes "pod_logs" { 2 | targets = discovery.relabel.pod_logs.output 3 | forward_to = [loki.process.ais_extract_level.receiver] 4 | } -------------------------------------------------------------------------------- /auth/keycloak/docker/recreate-volumes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sudo rm -rf db 4 | sudo rm -rf exports 5 | 6 | mkdir db 7 | mkdir exports 8 | sudo chown -R 1000:1000 $(pwd)/db 9 | sudo chown -R 1000:1000 $(pwd)/exports -------------------------------------------------------------------------------- /operator/config/overlays/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | patches: 4 | - path: monitor_tls_patch.yaml 5 | target: 6 | kind: ServiceMonitor 7 | name: controller-manager-metrics-monitor 8 | -------------------------------------------------------------------------------- /monitoring/kube-prom/values/node-exporter.yaml: -------------------------------------------------------------------------------- 1 | nodeExporter: 2 | enabled: false 3 | prometheus-node-exporter: 4 | # Don't scrape node-exporter with prom if it's disabled 5 | prometheus: 6 | monitor: 7 | enabled: false -------------------------------------------------------------------------------- /operator/config/base/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-reader 5 | rules: 6 | - nonResourceURLs: ["/metrics"] 7 | verbs: ["get"] 8 | -------------------------------------------------------------------------------- /playbooks/cloud/roles/oci_config/templates/config.j2: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | user={{ oci_user_ocid }} 3 | fingerprint={{ oci_fingerprint }} 4 | tenancy={{ oci_tenancy_ocid }} 5 | region={{ oci_region }} 6 | key_file=/root/.oci/OCI_PRIVATE_KEY 7 | 
-------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_cleanup_markers.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ cluster }}" 3 | gather_facts: no 4 | become: true 5 | vars_files: 6 | - "vars/ais_mpaths.yml" 7 | 8 | roles: 9 | - role: ais_cleanup_markers 10 | -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: alloy-config 5 | namespace: monitoring 6 | data: 7 | config.alloy: |- 8 | {{ tpl (.Files.Get "common/config.alloy.gotmpl") . | indent 4 }} -------------------------------------------------------------------------------- /pages/charts/artifacthub-repo.yaml: -------------------------------------------------------------------------------- 1 | # Artifact Hub repository metadata file 2 | repositoryID: d73e6789-2ae3-49ef-b431-4882b7729bcf 3 | owners: 4 | - name: AIStore Team 5 | email: aistore@nvidia.com 6 | - name: Aaron Wilson 7 | email: aawilson@nvidia.com -------------------------------------------------------------------------------- /ais-operator-helper/Makefile: -------------------------------------------------------------------------------- 1 | TAG ?= latest 2 | REGISTRY_URL ?= docker.io/aistorage 3 | 4 | all: build push 5 | 6 | build: 7 | docker build -t $(REGISTRY_URL)/ais-operator-helper:$(TAG) . 8 | 9 | push: 10 | docker push $(REGISTRY_URL)/ais-operator-helper:$(TAG) -------------------------------------------------------------------------------- /monitoring/alloy/environments/local/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | loki: 2 | localGateway: http://loki-gateway.monitoring.svc.cluster.local/loki/api/v1/push 3 | 4 | prometheus: 5 | localGateway: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090/api/v1/otlp/ -------------------------------------------------------------------------------- /auth/keycloak/docker/sample.env: -------------------------------------------------------------------------------- 1 | KC_BOOTSTRAP_ADMIN_USERNAME=admin 2 | KC_BOOTSTRAP_ADMIN_PASSWORD=admin 3 | KC_HTTPS_CERTIFICATE_FILE=/opt/keycloak/conf/server.crt.pem 4 | KC_HTTPS_CERTIFICATE_KEY_FILE=/opt/keycloak/conf/server.key.pem 5 | KC_HTTPS_ENABLED=true 6 | KC_HTTPS_PORT=8443 -------------------------------------------------------------------------------- /playbooks/cloud/vars/oci_config.yml: -------------------------------------------------------------------------------- 1 | target_dir: /tmp/oci 2 | secret_name: oci-creds 3 | 4 | # OCI literal values - configure these for your environment 5 | oci_tenancy_ocid: "" 6 | oci_user_ocid: "" 7 | oci_region: "" 8 | oci_fingerprint: "" 9 | oci_compartment_ocid: "" 10 | -------------------------------------------------------------------------------- /playbooks/extra/manual/vars/cluster_config.yaml: -------------------------------------------------------------------------------- 1 | base_path: "/ais" 2 | drives: 3 | - nvme0n1 4 | - nvme1n1 5 | - nvme2n1 6 | - nvme3n1 7 | - nvme4n1 8 | - nvme5n1 9 | - nvme6n1 10 | - nvme7n1 11 | - nvme8n1 12 | - nvme9n1 13 | - nvme10n1 14 | - nvme11n1 -------------------------------------------------------------------------------- /operator/config/base/webhook/service.yaml: 
--------------------------------------------------------------------------------
 1 | 
 2 | apiVersion: v1
 3 | kind: Service
 4 | metadata:
 5 |   name: webhook-service
 6 |   namespace: system
 7 | spec:
 8 |   ports:
 9 |     - port: 443
10 |       targetPort: 9443
11 |   selector:
12 |     control-plane: controller-manager
13 | 
--------------------------------------------------------------------------------
/operator/pkg/resources/cmn/cmn_suite_test.go:
--------------------------------------------------------------------------------
 1 | package cmn
 2 | 
 3 | import (
 4 |     "testing"
 5 | 
 6 |     . "github.com/onsi/ginkgo/v2"
 7 |     . "github.com/onsi/gomega"
 8 | )
 9 | 
10 | func TestCommon(t *testing.T) {
11 |     RegisterFailHandler(Fail)
12 |     RunSpecs(t, "Common Suite")
13 | }
14 | 
--------------------------------------------------------------------------------
/playbooks/host-config/roles/stat_tools/tasks/main.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | 
 3 | - name: update apt-cache
 4 |   apt:
 5 |     update_cache: yes
 6 | 
 7 | - name: install mpstat,iostat
 8 |   apt: name=sysstat state=latest
 9 | 
10 | 
11 | - name: install bcc
12 |   apt: name=bcc state=latest
13 | 
14 | 
--------------------------------------------------------------------------------
/auth/keycloak/manifests/admin-secret.template.yaml:
--------------------------------------------------------------------------------
 1 | # Creates a K8s Secret with the superuser credentials
 2 | apiVersion: v1
 3 | kind: Secret
 4 | metadata:
 5 |   name: ais-admin-secret
 6 |   namespace: ais
 7 | type: Opaque
 8 | stringData:
 9 |   SU-NAME: "${SU_NAME}"
10 |   SU-PASS: "${SU_PASS}"
--------------------------------------------------------------------------------
/ais-operator-helper/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM golang:1.25-alpine AS builder
 2 | 
 3 | RUN apk add --no-cache git
 4 | 
 5 | WORKDIR /src
 6 | COPY src/ ./
 7 | 
 8 | RUN go build -o /cleanup-helper cleanup-helper.go
 9 | 
10 | FROM alpine:latest
11 | 
12 | COPY --from=builder /cleanup-helper /cleanup-helper
--------------------------------------------------------------------------------
/operator/pkg/resources/target/target_suite_test.go:
--------------------------------------------------------------------------------
 1 | package target
 2 | 
 3 | import (
 4 |     "testing"
 5 | 
 6 |     . "github.com/onsi/ginkgo/v2"
 7 |     .
"github.com/onsi/gomega" 8 | ) 9 | 10 | func TestCommon(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Target Suite") 13 | } 14 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_delete_conf_host_path/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | - name: Delete all AIS configuration files from all nodes 2 | shell: find {{ host_path_prefix }} -type f -name ".ais*" -exec rm {} + 3 | ignore_errors: true 4 | when: host_path_prefix is defined and host_path_prefix != "" 5 | -------------------------------------------------------------------------------- /auth/keycloak/cnpg/helm/operator/helmfile.yaml: -------------------------------------------------------------------------------- 1 | repositories: 2 | - name: cnpg 3 | url: https://cloudnative-pg.github.io/charts 4 | 5 | releases: 6 | - name: cloudnative-pg-operator 7 | namespace: cnpg-system 8 | chart: cnpg/cloudnative-pg 9 | createNamespace: true 10 | wait: true -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_undeploy_operator/files/undeploy-operator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Undeploy AIS operator 5 | # 6 | 7 | release_version=${RELEASE:-v2.8.0} 8 | 9 | kubectl delete -f https://github.com/NVIDIA/ais-k8s/releases/download/${release_version}/ais-operator.yaml 10 | -------------------------------------------------------------------------------- /playbooks/host-config/vars/kubelet.yml: -------------------------------------------------------------------------------- 1 | # Path for kubelet config drop-in, if supported 2 | kubelet_systemd_path: /etc/systemd/system/kubelet.service.d/ 3 | # Path to kubelet config file (Depending on installation, could be /var/lib/kubelet/config.yaml) 4 | kubelet_var_path: /etc/kubernetes/kubelet-config.yaml -------------------------------------------------------------------------------- /monitoring/loki/environments/prod/values.yaml: -------------------------------------------------------------------------------- 1 | nodeAffinity: 2 | requiredDuringSchedulingIgnoredDuringExecution: 3 | nodeSelectorTerms: 4 | - matchExpressions: 5 | - key: monitoring 6 | operator: In 7 | values: 8 | - "true" 9 | 10 | storageClass: local-path -------------------------------------------------------------------------------- /helm/ais/charts/ais-cluster/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Deployed AIS with helm chart version {{ .Chart.Version }} to namespace {{ .Release.Namespace }} 2 | 3 | To learn more about the release, try: 4 | $ helm status {{ .Release.Name }} -n {{ .Release.Namespace }} 5 | $ helm get all {{ .Release.Name }} -n {{ .Release.Namespace }} -------------------------------------------------------------------------------- /operator/config/base/certmanager/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This configuration is for teaching kustomize how to update name ref and var substitution 2 | nameReference: 3 | - kind: Issuer 4 | group: cert-manager.io 5 | fieldSpecs: 6 | - kind: Certificate 7 | group: cert-manager.io 8 | path: spec/issuerRef/name -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_cleanup_markers/tasks/main.yml: 
--------------------------------------------------------------------------------
 1 | ---
 2 | - name: Copy clean-up scripts
 3 |   copy:
 4 |     src: "clear-markers.sh"
 5 |     dest: "/tmp/"
 6 |     mode: 0777
 7 | 
 8 | - name: Run delete scripts
 9 |   shell: "MPATHS={{ ais_mpaths | join(' ') | quote }} /tmp/clear-markers.sh"
10 | 
--------------------------------------------------------------------------------
/monitoring/vault/README.md:
--------------------------------------------------------------------------------
 1 | 1. Follow the wiki page in the AIS Gitlab repo to get the appropriate DL access.
 2 | 1. Copy vault.env from the Gitlab wiki
 3 | 1. `export $(cat vault.env | xargs)`
 4 | 1. `vault login -method=oidc role=storage-services`
 5 | 1. `kubectl config use-context <cluster-context>`
 6 | 1. `./update_secret.sh`
--------------------------------------------------------------------------------
/helm/cluster-issuer/helmfile.yaml:
--------------------------------------------------------------------------------
 1 | environments:
 2 |   sjc11:
 3 |     kubeContext: sjc11
 4 | ---
 5 | releases:
 6 |   # Defines a self-signed issuer for acquiring a tls cert
 7 |   - name: cluster-issuer
 8 |     chart: issuer-chart
 9 |     version: 0.1.0
10 |     values:
11 |       - "./config/{{ .Environment.Name }}.yaml"
--------------------------------------------------------------------------------
/operator/config/base/metallb/configmap.template.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap
 3 | metadata:
 4 |   name: metallb-config
 5 |   namespace: metallb-system
 6 | data:
 7 |   config: |
 8 |     address-pools:
 9 |       - name: default
10 |         protocol: layer2
11 |         addresses:
12 |           - $EXTERNAL_ADDRESSES
13 | 
--------------------------------------------------------------------------------
/operator/config/scorecard/patches/basic.config.yaml:
--------------------------------------------------------------------------------
 1 | - op: add
 2 |   path: /stages/0/tests/-
 3 |   value:
 4 |     entrypoint:
 5 |       - scorecard-test
 6 |       - basic-check-spec
 7 |     image: quay.io/operator-framework/scorecard-test:v1.3.0
 8 |     labels:
 9 |       suite: basic
10 |       test: basic-check-spec-test
11 | 
--------------------------------------------------------------------------------
/playbooks/host-config/ais_gpuhost_config.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Config specific to GPU hosts
 3 | #
 4 | ---
 5 | - hosts: '{{ ais_hosts }}'
 6 |   vars_files:
 7 |     - "vars/host_config_packages.yml"
 8 | 
 9 |   gather_facts: true
10 |   roles:
11 |     - ais_gpuhost_config
12 |     - ais_gpuhost_device_plugin
13 | 
14 | 
15 | 
16 | 
--------------------------------------------------------------------------------
/playbooks/extra/oci/growfs.yml:
--------------------------------------------------------------------------------
 1 | # Simple playbook to use the oci-growfs tool to expand the root volume on OCI hosts
 2 | # See https://docs.oracle.com/en-us/iaas/oracle-linux/oci-utils/index.htm#oci-growfs
 3 | - name: Expand OCI host root volume
 4 |   hosts: all
 5 |   gather_facts: false
 6 |   become: true
 7 |   roles:
 8 |     - growfs
--------------------------------------------------------------------------------
/auth/keycloak/cnpg/helm/cluster/helmfile.yaml:
--------------------------------------------------------------------------------
 1 | repositories:
 2 |   - name: cnpg
 3 |     url: https://cloudnative-pg.github.io/charts
 4 | 
 5 | releases:
 6 |   - name: cloudnative-pg
 7 |     namespace: cnpg-database
 8 |     chart: cnpg/cluster
 9 |     createNamespace: true
10 | 
wait: true 11 | values: 12 | - values.yaml -------------------------------------------------------------------------------- /helm/operator/config/operator/keycloak.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | controllerManager: 2 | manager: 3 | env: 4 | operatorSkipVerifyCrt: "false" 5 | # Trust bundle for requests to Auth services 6 | authCAConfigmapName: aistore.nvidia.com 7 | # Trust bundle for requests to AIS API 8 | aisCAConfigmapName: aistore.nvidia.com 9 | -------------------------------------------------------------------------------- /monitoring/vault/update_secret.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl delete secret $LOCAL_ALLOY_SECRET -n monitoring --ignore-not-found 4 | 5 | vault kv get -format json -field data $VAULT_ALLOY_SECRET | jq -r 'to_entries[] | "--from-literal=\(.key)=\(.value)"' | \ 6 | xargs kubectl create secret generic -n monitoring $LOCAL_ALLOY_SECRET 7 | -------------------------------------------------------------------------------- /operator/scripts/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: ais-rbac 5 | subjects: 6 | - kind: ServiceAccount 7 | name: default 8 | namespace: default 9 | roleRef: 10 | kind: ClusterRole 11 | name: cluster-admin 12 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /playbooks/security/os_hardening.yaml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | become: yes 3 | roles: 4 | - filesystem_kernel 5 | - journald 6 | - kernel 7 | - network_kernel 8 | - crypto_policy 9 | - tmp_dir 10 | - aide 11 | - cron 12 | - rsyslog 13 | - pam 14 | - sshd 15 | - sudo 16 | - profile 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.tgz 2 | *.lock 3 | *.state 4 | *.swp 5 | *.vscode 6 | .idea 7 | *.env 8 | 9 | *ansible.cfg 10 | 11 | *hosts.ini 12 | *hosts.yaml 13 | 14 | bundle.Dockerfile 15 | bundle/ 16 | dist/ 17 | 18 | # Allow all files in github pages directory 19 | !/pages/** 20 | 21 | # Exceptions to still ignore in pages/ 22 | **/.DS_Store -------------------------------------------------------------------------------- /helm/ais/charts/cloud-secrets/templates/gcp-secret.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.gcp_json .Values.cloud.gcpSecretName}} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ .Values.cloud.gcpSecretName }} 6 | namespace: {{ .Release.Namespace }} 7 | type: Opaque 8 | data: 9 | gcp.json: {{ .Values.gcp_json | b64enc }} 10 | {{- end }} -------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_host_config_common/files/aishostconfig.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=AIS host config tweaks 3 | Wants=network-online.target 4 | After=network-online.target 5 | 6 | [Service] 7 | Type=oneshot 8 | ExecStart=/usr/local/bin/ais_host_config.sh 9 | RemainAfterExit=true 10 | 11 | [Install] 12 | WantedBy=multi-user.target 
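A minimal sketch of installing and activating the unit above on a storage host, assuming the companion script referenced by ExecStart (/usr/local/bin/ais_host_config.sh) has already been placed there by the ais_host_config_common role:

    sudo cp aishostconfig.service /etc/systemd/system/
    sudo systemctl daemon-reload
    sudo systemctl enable --now aishostconfig.service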
-------------------------------------------------------------------------------- /operator/config/base/manager/controller_manager_config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: controller-runtime.sigs.k8s.io/v1beta1 2 | kind: ControllerManagerConfig 3 | health: 4 | healthProbeBindAddress: :8081 5 | metrics: 6 | bindAddress: :8443 7 | webhook: 8 | port: 9443 9 | leaderElection: 10 | leaderElect: true 11 | resourceName: 60c23797.nvidia.com 12 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_deploy_operator/templates/deploy-operator.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Deploy AIS operator 5 | # 6 | 7 | curl https://raw.githubusercontent.com/NVIDIA/ais-k8s/main/operator/scripts/deploy.sh -o /tmp/deploy.sh 8 | chmod +x /tmp/deploy.sh 9 | { echo y; } | RELEASE="{{ operator_version }}" /tmp/deploy.sh 10 | -------------------------------------------------------------------------------- /operator/config/base/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | 4 | generatorOptions: 5 | disableNameSuffixHash: true 6 | 7 | configMapGenerator: 8 | - files: 9 | - controller_manager_config.yaml 10 | name: manager-config 11 | apiVersion: kustomize.config.k8s.io/v1beta1 12 | kind: Kustomization 13 | images: 14 | - name: controller -------------------------------------------------------------------------------- /operator/config/base/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: manager-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: manager-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /playbooks/extra/oci/hosts-example.ini: -------------------------------------------------------------------------------- 1 | ; Example hosts file for configuring OCI networks 2 | ; Include: 3 | ; 1. Primary host IP to connect 4 | ; 2. Comma-separated list of additional ips to configure 5 | ; 3. 
Matching OCI ID of each VNIC to attach to each IP 6 | 7 | [oci_hosts] 8 | 10.51.248.1 additional_ips=10.51.248.32,10.51.248.33 ocids=vnic1-OCID,vnic2-OCID 9 | -------------------------------------------------------------------------------- /auth/keycloak/docker/openssl-san.cnf: -------------------------------------------------------------------------------- 1 | [req] 2 | default_bits = 2048 3 | prompt = no 4 | default_md = sha256 5 | distinguished_name = dn 6 | req_extensions = req_ext 7 | 8 | [dn] 9 | C = US 10 | ST = CA 11 | L = Santa Clara 12 | O = Local Testing 13 | CN = localhost 14 | 15 | [req_ext] 16 | subjectAltName = @alt_names 17 | 18 | [alt_names] 19 | DNS.1 = localhost -------------------------------------------------------------------------------- /operator/config/base/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: proxy-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: proxy-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_ansible_collections/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Install ansible collections listed in the collections file 2 | command: 3 | cmd: ansible-galaxy collection install -r {{ role_path }}/files/collections.yml 4 | register: install_result 5 | changed_when: "'already installed' not in install_result.stdout" 6 | ignore_errors: yes -------------------------------------------------------------------------------- /monitoring/promtail/values.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | clients: 3 | - url: {{ .Values | get "promtail.clientUrl" }} 4 | tenant_id: {{ .Values | get "promtail.tenant" }} 5 | {{- if hasKey .Values.promtail "clusterLabel" }} 6 | external_labels: 7 | cluster: {{ .Values.promtail.clusterLabel }} 8 | {{- end }} 9 | configmap: 10 | enabled: true -------------------------------------------------------------------------------- /operator/config/base/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: leader-election-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: leader-election-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: system 13 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/install_requirements.yml: -------------------------------------------------------------------------------- 1 | - name: Install requirements on localhost 2 | hosts: localhost 3 | gather_facts: no 4 | roles: 5 | - install_ansible_collections 6 | 7 | - name: Install requirements for controller hosts 8 | hosts: controller 9 | gather_facts: no 10 | become: yes 11 | roles: 12 | - install_controller_requirements -------------------------------------------------------------------------------- /auth/keycloak/scripts/README.md: -------------------------------------------------------------------------------- 1 | This directory contains some utility scripts for setting up AIStore in a 
local development K8s cluster with Keycloak. 2 | 3 | Keycloak includes most realm settings in the realm export, which can be imported into any deployment. 4 | For better compatibility and security, the scripts here let you automatically create a new `ais-admin` user on startup. -------------------------------------------------------------------------------- /monitoring/alloy/environments/remote/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | loki: 2 | remoteEndpoint: {{ env "LOKI_ENDPOINT_PANOPTES" }} 3 | localGateway: 4 | 5 | mimir: 6 | remoteEndpoint: {{ env "MIMIR_ENDPOINT_PANOPTES" }} 7 | 8 | remote: 9 | label: {{ requiredEnv "CLUSTER_LABEL" }} 10 | secret: {{ requiredEnv "REMOTE_AUTH_SECRET" }} 11 | scope: {{ requiredEnv "REMOTE_AUTH_SCOPE" }} -------------------------------------------------------------------------------- /operator/config/base/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: proxy-role 5 | rules: 6 | - apiGroups: ["authentication.k8s.io"] 7 | resources: 8 | - tokenreviews 9 | verbs: ["create"] 10 | - apiGroups: ["authorization.k8s.io"] 11 | resources: 12 | - subjectaccessreviews 13 | verbs: ["create"] 14 | -------------------------------------------------------------------------------- /operator/config/base/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - role.yaml 3 | - role_binding.yaml 4 | - leader_election_role.yaml 5 | - leader_election_role_binding.yaml 6 | # Comment the following 3 lines if you want to disable 7 | # the auth proxy protecting the /metrics endpoint. 8 | - auth_proxy_role.yaml 9 | - auth_proxy_role_binding.yaml 10 | - auth_proxy_client_clusterrole.yaml 11 | -------------------------------------------------------------------------------- /log-sidecar/Makefile: -------------------------------------------------------------------------------- 1 | CONTAINER_TOOL ?= docker 2 | REGISTRY ?= docker.io 3 | REPO_NS ?= aistorage 4 | REPO_NAME ?= ais-logs 5 | TAG ?= v1.1 6 | IMG ?= $(REGISTRY)/$(REPO_NS)/$(REPO_NAME):$(TAG) 7 | 8 | .PHONY: all 9 | all: build push 10 | 11 | .PHONY: build 12 | build: 13 | $(CONTAINER_TOOL) build -t ${IMG} . 14 | 15 | .PHONY: push 16 | push: 17 | $(CONTAINER_TOOL) push ${IMG} 18 | -------------------------------------------------------------------------------- /log-sidecar/README.md: -------------------------------------------------------------------------------- 1 | This Dockerfile creates a distroless image containing only a `tail` binary. 2 | 3 | To use this image, attach a volume and pass the filename as the container argument; `tail` reads that file and writes it to stdout.
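Since the image is distroless there is no shell inside it; a hedged way to confirm what it runs is to inspect the configured entrypoint (image reference taken from the Makefile defaults above):

```bash
# Expect the entrypoint to be the tail binary; exact output may vary by build.
docker inspect aistorage/ais-logs:v1.1 --format '{{.Config.Entrypoint}}'
```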
4 | 5 | Example of mounting and reading from a file `./test/test.txt` in the current directory: 6 | ```bash 7 | docker run -v ./test:/test aistorage/ais-logs:v1.1 /test/test.txt 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /operator/config/base/metrics_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | name: controller-manager-metrics-service 7 | namespace: system 8 | spec: 9 | ports: 10 | - name: https 11 | port: 8443 12 | protocol: TCP 13 | targetPort: 8443 14 | selector: 15 | control-plane: controller-manager -------------------------------------------------------------------------------- /operator/config/overlays/default/aistores_cainjection_patch.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds a directive for certmanager to inject CA into the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE_PLACEHOLDER/CERTIFICATE_NAME_PLACEHOLDER 7 | name: aistores.ais.nvidia.com 8 | -------------------------------------------------------------------------------- /monitoring/kube-state-metrics/environments/prod/values.yaml: -------------------------------------------------------------------------------- 1 | metricAllowlist: 2 | - kube_node_info 3 | - kube_node_status_condition 4 | - kube_statefulset_replicas 5 | - kube_statefulset_status_replicas 6 | - kube_statefulset_status_replicas_ready 7 | - kube_pod_container_info 8 | - kube_pod_status_phase 9 | - kube_resourcequota 10 | - kube_pod_container_status_restarts_total -------------------------------------------------------------------------------- /operator/scripts/kind_cluster_local.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | - role: worker 6 | labels: 7 | ais-node: true 8 | - role: worker 9 | labels: 10 | ais-node: true 11 | - role: worker 12 | labels: 13 | ais-node: true 14 | - role: worker 15 | labels: 16 | ais-node: true 17 | -------------------------------------------------------------------------------- /operator/tests/test.dockerfile: -------------------------------------------------------------------------------- 1 | ARG GO_VERSION=1.25 2 | FROM docker.io/library/golang:${GO_VERSION}-alpine 3 | 4 | RUN apk add --no-cache bash curl git make which 5 | 6 | ENV LOCALBIN="/bin" 7 | 8 | COPY . 
/operator 9 | 10 | RUN cd /operator \ 11 | && go mod download \ 12 | && make kustomize controller-gen envtest golangci-lint mockgen 13 | 14 | ENTRYPOINT ["sleep", "infinity"] 15 | -------------------------------------------------------------------------------- /playbooks/cloud/ais_oci_config.yml: -------------------------------------------------------------------------------- 1 | # Run this and restart the cluster pods to update access to oci 2 | # Copies the oci_api_key containing oci credentials in roles/oci_config/files 3 | # to the controller host and recreates the kubernetes secret 4 | 5 | - hosts: controller 6 | vars_files: 7 | - "vars/oci_config.yml" 8 | 9 | gather_facts: false 10 | roles: 11 | - oci_config 12 | -------------------------------------------------------------------------------- /helm/ais-client/trust-bundle.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: trust.cert-manager.io/v1alpha1 2 | kind: Bundle 3 | metadata: 4 | name: aistore.nvidia.com 5 | spec: 6 | sources: 7 | - secret: 8 | name: ca-root-secret 9 | key: tls.crt 10 | target: 11 | configMap: 12 | key: trust-bundle.pem 13 | namespaceSelector: 14 | matchLabels: 15 | kubernetes.io/metadata.name: ais -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_undeploy_operator/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Copy operator undeploy script 3 | become: true 4 | copy: 5 | src: "undeploy-operator.sh" 6 | dest: "/tmp/" 7 | mode: 0777 8 | 9 | - name: Run undeploy operator scripts 10 | command: "/tmp/undeploy-operator.sh" 11 | register: undeployout 12 | changed_when: "'deleted' in undeployout.stdout" 13 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/fetch_ca_cert/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Get certificate value from K8s secret 2 | shell: kubectl get secret -n {{cluster}} {{ca_cert_secret}} -o jsonpath="{.data['ca\.crt']}" | base64 --decode 3 | register: cacert 4 | 5 | - name: Copy value of cacert to local file 6 | copy: 7 | content: "{{ cacert.stdout }}" 8 | dest: "{{ cacert_file }}" 9 | delegate_to: localhost -------------------------------------------------------------------------------- /playbooks/cloud/ais_aws_config.yml: -------------------------------------------------------------------------------- 1 | # Run this and restart the cluster pods to update access to aws 2 | # Copies the provided aws config and credentials files in roles/aws_config/files to the controller host 3 | # and recreates the kubernetes secret 4 | 5 | - hosts: controller 6 | vars_files: 7 | - "vars/aws_config.yml" 8 | 9 | gather_facts: false 10 | roles: 11 | - aws_config 12 | -------------------------------------------------------------------------------- /playbooks/host-config/ais_datafs_umount.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Unmount the set of ais_devices on the set of hosts ais_hosts (variables to 3 | # be specified on cmdline), but do not remove their fstab entries. 
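# A hedged example invocation (the inventory path and host-group name are
# assumptions, not fixed by this playbook):
#   ansible-playbook -i hosts.ini ais_datafs_umount.yml -e ais_hosts=ais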
4 | # 5 | --- 6 | - hosts: "{{ ais_hosts }}" 7 | vars_files: 8 | - "vars/ais_datafs.yml" 9 | 10 | gather_facts: true 11 | roles: 12 | - role: ais_datafs 13 | ais_fs_umount: true 14 | -------------------------------------------------------------------------------- /manifests/debug/aisnode_debug.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: aisnode-debug 5 | namespace: ais 6 | labels: 7 | app: aisnode-debug 8 | spec: 9 | containers: 10 | - image: aistorage/ais-util 11 | command: 12 | - "tail" 13 | - "-f" 14 | - "/dev/null" 15 | imagePullPolicy: Always 16 | name: aisnode-debug 17 | restartPolicy: Always 18 | -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/common/config.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | {{ tpl (.Files.Get "common/common.alloy.gotmpl") . }} 2 | {{ tpl (.Files.Get "common/metrics.alloy.gotmpl") . }} 3 | {{ tpl (.Files.Get (printf "environments/%s/metrics.alloy.gotmpl" .Values.environment)) . }} 4 | {{ tpl (.Files.Get "common/logs.alloy.gotmpl") . }} 5 | {{ tpl (.Files.Get (printf "environments/%s/logs.alloy.gotmpl" .Values.environment)) . }} -------------------------------------------------------------------------------- /playbooks/host-config/ais_datafs_umount_purge.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Unmount the set of ais_devices on the set of hosts ais_hosts (variables to 3 | # be specified on cmdline), and remove their fstab entries. 4 | # 5 | --- 6 | - hosts: "{{ ais_hosts }}" 7 | vars_files: 8 | - "vars/ais_datafs.yml" 9 | 10 | gather_facts: true 11 | roles: 12 | - role: ais_datafs 13 | ais_fs_umount_purge: true 14 | 15 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/create_network_definition/files/create-network-definition.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NAD_NAME="$NAME" 3 | export NAD_NAMESPACE="$NAMESPACE" 4 | export NAD_IFACE="$INTERFACE" 5 | source_dir=$(dirname "${BASH_SOURCE[0]}") 6 | 7 | envsubst < "${source_dir}"/nad.template.yaml > /tmp/network-attachment-def.yaml 8 | kubectl apply -f /tmp/network-attachment-def.yaml 9 | rm /tmp/network-attachment-def.yaml -------------------------------------------------------------------------------- /helm/ais/charts/cloud-secrets/templates/aws-secret.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.aws_config .Values.aws_credentials .Values.cloud.awsSecretName }} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ .Values.cloud.awsSecretName }} 6 | namespace: {{ .Release.Namespace }} 7 | type: Opaque 8 | data: 9 | config: {{ .Values.aws_config | b64enc }} 10 | credentials: {{ .Values.aws_credentials | b64enc }} 11 | {{- end }} -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/create_namespace/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Check k8s_namespace variable 2 | fail: 3 | msg: "`k8s_namespace` name must be provided!" 
4 | when: k8s_namespace is undefined or k8s_namespace | length == 0 5 | 6 | - name: Create namespace if it does not exist 7 | shell: "kubectl create ns {{ k8s_namespace }} || true" 8 | register: namespaceout 9 | changed_when: "'created' in namespaceout.stdout" -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_cleanup_all/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Copy cleanup mpath scripts 3 | copy: 4 | src: "clean-mpaths.sh" 5 | dest: "/tmp/" 6 | mode: 0777 7 | 8 | - name: Run cleanup scripts 9 | shell: "MPATHS={{ ais_mpaths | join(' ') | quote }} /tmp/clean-mpaths.sh" 10 | 11 | - name: Run cleanup meta directory 12 | file: 13 | state: absent 14 | path: "/etc/ais/{{ cluster }}" 15 | -------------------------------------------------------------------------------- /playbooks/host-config/ais_datafs_mount.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Mount the set of ais_devices on the set of hosts ais_hosts (variables to 3 | # be specified on cmdline). Assumes the filesystems have already been created 4 | # and are in fstab. 5 | # 6 | --- 7 | - hosts: "{{ ais_hosts }}" 8 | vars_files: 9 | - "vars/ais_datafs.yml" 10 | 11 | gather_facts: true 12 | roles: 13 | - role: ais_datafs 14 | ais_fs_mount: true 15 | 16 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_deploy_cluster/templates/sysctls.json.j2: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | name: "net.core.somaxconn", 4 | value: "{{ somaxconn }}" 5 | }, 6 | { 7 | name: "net.ipv4.ip_local_port_range", 8 | value: "{{ ip_local_port_range }}" 9 | }, 10 | { 11 | name: "net.ipv4.tcp_tw_reuse", 12 | value: "{{ tcp_tw_reuse }}" 13 | }, 14 | { 15 | name: "net.ipv4.tcp_max_tw_buckets", 16 | value: "{{ tcp_max_tw_buckets }}" 17 | } 18 | ] -------------------------------------------------------------------------------- /operator/pkg/resources/cmn/services.go: -------------------------------------------------------------------------------- 1 | // Package cmn provides utilities for common AIS cluster resources 2 | /* 3 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package cmn 6 | 7 | func NewServiceLabels(aisName, component string) map[string]string { 8 | return map[string]string{ 9 | LabelApp: aisName, 10 | LabelAppPrefixed: aisName, 11 | LabelComponentPrefixed: component, 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /playbooks/host-config/ais_ntp.yml: -------------------------------------------------------------------------------- 1 | # 2 | # *If* your NTP is required to use a local pool server (e.g., the DC firewall is blocking 3 | # full access to port 123), then use this playbook to configure a chosen set of servers 4 | # and to perform an initial (possibly large) step sync to them.
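# A hedged example run (the inventory path is an assumption; the server list
# itself comes from vars/ntp.yml referenced below):
#   ansible-playbook -i hosts.ini ais_ntp.yml -e ais_hosts=ais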
5 | # 6 | --- 7 | - hosts: '{{ ais_hosts | default("ais") }}' 8 | vars_files: 9 | - "vars/ntp.yml" 10 | 11 | gather_facts: False 12 | roles: 13 | - ais_ntp -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_cleanup_markers/templates/clear-markers.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Script used for removing metadata stored on each mountpath. 5 | # It should be executed on each storage target that needs a cleanup. 6 | # 7 | 8 | mpaths="{{ ais_mpaths | join(' ') | quote }}" # Adjust mpaths if needed. 9 | 10 | for path in ${mpaths}; do 11 | rm -rf "${path}"/.ais.vmd 12 | rm -rf "${path}"/.ais.markers 13 | done 14 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_delete_cluster/templates/delete_cluster.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform the delete operation 4 | kubectl delete aistores.ais.nvidia.com -n {{ cluster }} {{ cluster }} 5 | 6 | # Check if there are any remaining pods in the namespace 7 | while kubectl get pods -n {{ cluster }} | grep -q ais; do 8 | echo "Waiting for pods to be deleted..." 9 | sleep 5 10 | done 11 | 12 | echo "All pods have been deleted." 13 | -------------------------------------------------------------------------------- /playbooks/extra/oci/roles/growfs/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | - name: Check if /usr/libexec/oci-growfs exists 2 | stat: 3 | path: /usr/libexec/oci-growfs 4 | register: growfs_file_status 5 | 6 | - name: Fail if /usr/libexec/oci-growfs does not exist 7 | fail: 8 | msg: "/usr/libexec/oci-growfs does not exist" 9 | when: not growfs_file_status.stat.exists 10 | 11 | - name: Run oci-growfs to expand root filesystem 12 | command: /usr/libexec/oci-growfs -y -------------------------------------------------------------------------------- /operator/config/base/certmanager/issuer.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a certificate CR. 2 | # More document can be found at https://docs.cert-manager.io 3 | # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. 4 | apiVersion: cert-manager.io/v1 5 | kind: Issuer 6 | metadata: 7 | name: selfsigned-issuer 8 | namespace: system 9 | spec: 10 | selfSigned: {} 11 | -------------------------------------------------------------------------------- /operator/config/base/rbac/aistore_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions for end users to view aistores. 
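# A hedged example of binding this role once created (the user name is purely
# illustrative):
#   kubectl create clusterrolebinding aistore-viewer \
#     --clusterrole=aistore-viewer-role --user=jane@example.com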
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: aistore-viewer-role 6 | rules: 7 | - apiGroups: 8 | - ais.nvidia.com 9 | resources: 10 | - aistores 11 | verbs: 12 | - get 13 | - list 14 | - watch 15 | - apiGroups: 16 | - ais.nvidia.com 17 | resources: 18 | - aistores/status 19 | verbs: 20 | - get 21 | -------------------------------------------------------------------------------- /operator/config/overlays/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Prometheus Monitor Service (Metrics) 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: ServiceMonitor 5 | metadata: 6 | labels: 7 | control-plane: controller-manager 8 | name: controller-manager-metrics-monitor 9 | namespace: system 10 | spec: 11 | endpoints: 12 | - path: /metrics 13 | port: https 14 | selector: 15 | matchLabels: 16 | control-plane: controller-manager 17 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/vars/multihome.yml: -------------------------------------------------------------------------------- 1 | # Name of the Network Attachment Definition to be referenced as an annotation by the AIS stateful set 2 | # See the multus example: https://github.com/k8snetworkplumbingwg/multus-cni/blob/master/docs/quickstart.md#storing-a-configuration-as-a-custom-resource 3 | network_attachment: "" 4 | 5 | # Name of the interface for which to create a network attachment definition 6 | network_interface: "" 7 | 8 | attachment_namespace: "ais" -------------------------------------------------------------------------------- /operator/config/overlays/default/aistores_keep_policy_patch.yaml: -------------------------------------------------------------------------------- 1 | # The following patch adds helm.sh/resource-policy: keep annotation to prevent 2 | # the CRD from being deleted during helm uninstall, which would cascade delete 3 | # all AIStore custom resources (actual AIS clusters) 4 | apiVersion: apiextensions.k8s.io/v1 5 | kind: CustomResourceDefinition 6 | metadata: 7 | annotations: 8 | helm.sh/resource-policy: keep 9 | name: aistores.ais.nvidia.com 10 | 11 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_delete_conf_state_storage/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Delete PVCs with stateStorageClass 3 | shell: > 4 | kubectl get pvc -n {{ cluster }} -o jsonpath='{.items[?(@.spec.storageClassName=="{{ state_storage_class }}")].metadata.name}' | xargs -r kubectl delete pvc -n {{ cluster }} 5 | register: delete_pvc_output 6 | changed_when: delete_pvc_output.stdout != "" 7 | when: state_storage_class is defined and state_storage_class != "" 8 | -------------------------------------------------------------------------------- /auth/keycloak/manifests/trust-bundle.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: trust.cert-manager.io/v1alpha1 2 | kind: Bundle 3 | metadata: 4 | name: aistore.nvidia.com 5 | namespace: cert-manager 6 | spec: 7 | sources: 8 | - secret: 9 | name: ca-root-secret # default value for cluster issuer helm chart, update as needed 10 | key: tls.crt 11 | target: 12 | namespaceSelector: 13 | matchLabels: 14 | ais-trust: "true" 15 | configMap: 16 | key: ca.crt -------------------------------------------------------------------------------- 
/operator/config/overlays/default/manager_env_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | env: 12 | - name: OPERATOR_NAMESPACE 13 | valueFrom: 14 | fieldRef: 15 | fieldPath: metadata.namespace 16 | - name: OPERATOR_SKIP_VERIFY_CRT 17 | value: "True" -------------------------------------------------------------------------------- /helm/ais-client/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: aistorage/ais-util 3 | tag: latest 4 | pullPolicy: IfNotPresent 5 | 6 | serviceAccount: 7 | create: true 8 | name: ais-client 9 | 10 | ais: 11 | endpoint: https://ais-proxy.ais.svc.cluster.local:51080 12 | caConfigMap: aistore.nvidia.com 13 | bundleFile: trust-bundle.pem 14 | 15 | resources: 16 | limits: 17 | cpu: 2 18 | memory: 2Gi 19 | requests: 20 | cpu: 2 21 | memory: 2Gi 22 | 23 | nodeSelector: -------------------------------------------------------------------------------- /operator/config/overlays/default/webhook/aistores_conversion_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | # The following patch enables a conversion webhook for the CRD 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: aistores.ais.nvidia.com 6 | spec: 7 | conversion: 8 | strategy: Webhook 9 | webhook: 10 | clientConfig: 11 | service: 12 | namespace: system 13 | name: webhook-service 14 | path: /convert 15 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_cleanup_all/files/clean-mpaths.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Remove ais related data from all mountpaths 5 | # WARNING: will also clean up data 6 | # 7 | 8 | mpaths=${MPATHS:-"/ais/sda /ais/sdb /ais/sdc /ais/sdd /ais/sde /ais/sdf /ais/sdg /ais/sdh /ais/sdi /ais/sdj"} # Adjust mpaths if needed. 9 | 10 | for m in ${mpaths}; do 11 | rm -rf $m/@ais 12 | rm -rf $m/@gcp 13 | rm -rf $m/@aws 14 | rm -rf $m/.ais.* 15 | done 16 | -------------------------------------------------------------------------------- /monitoring/alloy/environments/prod/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | loki: 2 | remoteEndpoint: {{ env "LOKI_ENDPOINT_PANOPTES" }} 3 | 4 | prometheus: 5 | localGateway: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090/api/v1/otlp/ 6 | 7 | mimir: 8 | remoteEndpoint: {{ env "MIMIR_ENDPOINT_PANOPTES" }} 9 | 10 | remote: 11 | label: {{ requiredEnv "CLUSTER_LABEL" }} 12 | secret: {{ requiredEnv "REMOTE_AUTH_SECRET" }} 13 | scope: {{ requiredEnv "REMOTE_AUTH_SCOPE" }} -------------------------------------------------------------------------------- /operator/config/overlays/default/webhook/webhook_cainjection_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds an annotation to the admission webhook config; 2 | # CERTIFICATE_NAMESPACE_PLACEHOLDER/CERTIFICATE_NAME_PLACEHOLDER will be substituted by kustomize.
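# After substitution the annotation takes cert-manager's usual
# <namespace>/<certificate-name> form, e.g. (values illustrative):
#   cert-manager.io/inject-ca-from: ais-operator-system/serving-cert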
3 | apiVersion: admissionregistration.k8s.io/v1 4 | kind: ValidatingWebhookConfiguration 5 | metadata: 6 | name: validating-webhook-configuration 7 | annotations: 8 | cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE_PLACEHOLDER/CERTIFICATE_NAME_PLACEHOLDER 9 | -------------------------------------------------------------------------------- /operator/config/scorecard/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - bases/config.yaml 3 | patchesJson6902: 4 | - path: patches/basic.config.yaml 5 | target: 6 | group: scorecard.operatorframework.io 7 | version: v1alpha3 8 | kind: Configuration 9 | name: config 10 | - path: patches/olm.config.yaml 11 | target: 12 | group: scorecard.operatorframework.io 13 | version: v1alpha3 14 | kind: Configuration 15 | name: config 16 | # +kubebuilder:scaffold:patchesJson6902 17 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_downscale_cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Patch AIStore Size 3 | hosts: controller 4 | vars_prompt: 5 | - name: size 6 | prompt: "Enter the new size for AIStore cluster" 7 | private: no 8 | tasks: 9 | - name: "Patch AIStore resource to change its size" 10 | shell: "kubectl patch aistore ais -n ais --type='json' -p='[{\"op\": \"replace\", \"path\": \"/spec/size\", \"value\":{{ size }}}]'" 11 | args: 12 | executable: /bin/bash 13 | -------------------------------------------------------------------------------- /auth/keycloak/docker/docker-keycloak.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | docker run --rm --name keycloak \ 4 | -v $(pwd)/db:/opt/keycloak/data/h2 \ 5 | -v $(pwd)/../realm/aistore-realm.json:/opt/keycloak/data/import/aistore-realm.json \ 6 | -v $(pwd)/server.crt.pem:/opt/keycloak/conf/server.crt.pem:ro \ 7 | -v $(pwd)/server.key.pem:/opt/keycloak/conf/server.key.pem:ro \ 8 | --env-file sample.env \ 9 | -p 8443:8443 \ 10 | quay.io/keycloak/keycloak:latest \ 11 | start-dev --import-realm -------------------------------------------------------------------------------- /playbooks/host-config/ais_host_config_pcm.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Install the PCM tool to all hosts in the 'ais' host group 3 | # 4 | --- 5 | - hosts: '{{ ais_hosts | default("ais") }}' 6 | vars_prompt: 7 | - name: pcm_install_confirmation 8 | prompt: "Do you want to install the pcm tool on {{ ais_hosts }}? Type 'yes' to confirm." 9 | default: "no" 10 | private: no 11 | become: true 12 | gather_facts: false 13 | roles: 14 | - role: pcm 15 | when: pcm_install_confirmation | bool -------------------------------------------------------------------------------- /monitoring/loki/README.md: -------------------------------------------------------------------------------- 1 | # Loki 2 | - [Main docs](https://grafana.com/docs/loki/latest/) 3 | - [Chart source](https://github.com/grafana/loki/tree/main/production/helm/loki) 4 | - [Additional values options](https://grafana.com/docs/loki/latest/setup/install/helm/reference/) 5 | 6 | # Usage 7 | 8 | ## Template a new environment 9 | 10 | `set -a; . ../oci-iad.env ; set +a; helmfile -e prod template` 11 | 12 | ## Sync a new environment 13 | 14 | `set -a; . 
../oci-iad.env ; set +a; helmfile -e prod sync` -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_delete_cluster/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Copy `delete_cluster` script after processing as a template 3 | become: true 4 | template: 5 | src: "delete_cluster.sh.j2" 6 | dest: "/tmp/delete_cluster.sh" 7 | mode: 0777 8 | 9 | - name: Delete aistore cluster and wait till all pods get deleted 10 | shell: "/tmp/delete_cluster.sh" 11 | register: delete_cluster 12 | changed_when: "'configured' in delete_cluster.stdout or 'created' in delete_cluster.stdout" -------------------------------------------------------------------------------- /auth/keycloak/kind/config.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | - role: worker 6 | # Mount to support openEBS local volumes 7 | extraMounts: 8 | - hostPath: /run/udev 9 | containerPath: /run/udev 10 | - role: worker 11 | extraMounts: 12 | - hostPath: /run/udev 13 | containerPath: /run/udev 14 | - role: worker 15 | extraMounts: 16 | - hostPath: /run/udev 17 | containerPath: /run/udev 18 | -------------------------------------------------------------------------------- /auth/keycloak/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==25.1.0 2 | anyio==4.12.0 3 | async-property==0.2.2 4 | certifi==2025.11.12 5 | cffi==2.0.0 6 | charset-normalizer==3.4.4 7 | cryptography==46.0.3 8 | deprecation==2.1.0 9 | exceptiongroup==1.3.1 10 | h11==0.16.0 11 | httpcore==1.0.9 12 | httpx==0.28.1 13 | idna==3.11 14 | jwcrypto==1.5.6 15 | packaging==25.0 16 | pycparser==2.23 17 | python-keycloak==5.8.1 18 | requests==2.32.5 19 | requests-toolbelt==1.0.0 20 | typing_extensions==4.15.0 21 | urllib3==2.6.0 22 | -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/values-sample.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | - example.com 13 | - www.example.com 14 | ipAddresses: 15 | - 192.168.0.1 16 | - 192.168.0.2 17 | emailAddress: ais@exchange.nvidia.com 18 | issuerRef: 19 | name: selfsigned 20 | kind: Issuer -------------------------------------------------------------------------------- /helm/ais/charts/cloud-secrets/templates/oci-secret.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.oci_config .Values.oci_api_key .Values.cloud.ociSecretName }} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ .Values.cloud.ociSecretName }} 6 | namespace: {{ .Release.Namespace }} 7 | type: Opaque 8 | data: 9 | config: {{ .Values.oci_config | b64enc }} 10 | {{- if and .Values.oci_cli_rc }} 11 | oci_cli_rc: {{ .Values.oci_cli_rc | b64enc }} 12 | {{- end }} 13 | oci_api_key: {{ .Values.oci_api_key | b64enc }} 14 | {{- end }} -------------------------------------------------------------------------------- /operator/config/base/rbac/aistore_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions 
for end users to edit aistores. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: aistore-editor-role 6 | rules: 7 | - apiGroups: 8 | - ais.nvidia.com 9 | resources: 10 | - aistores 11 | verbs: 12 | - create 13 | - delete 14 | - get 15 | - list 16 | - patch 17 | - update 18 | - watch 19 | - apiGroups: 20 | - ais.nvidia.com 21 | resources: 22 | - aistores/status 23 | verbs: 24 | - get 25 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/create_network_definition/files/nad.template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "k8s.cni.cncf.io/v1" 2 | kind: NetworkAttachmentDefinition 3 | metadata: 4 | name: $NAD_NAME 5 | namespace: $NAD_NAMESPACE 6 | spec: 7 | config: '{ 8 | "cniVersion": "0.3.0", 9 | "type": "macvlan", 10 | "master": "$NAD_IFACE", 11 | "mode": "bridge", 12 | "ipam": { 13 | "type": "whereabouts", 14 | "range": "192.168.1.0/24", 15 | "gateway": "192.168.1.1" 16 | } 17 | }' -------------------------------------------------------------------------------- /playbooks/cloud/ais_gcp_config.yml: -------------------------------------------------------------------------------- 1 | # Run this and restart the cluster pods to update access to gcp 2 | # Note: Your cluster should have the gcp-creds secret set up prior to this, 3 | # or you will need to redeploy the whole cluster. 4 | # Copies the provided gcp.json containing gcp credentials in 5 | # roles/gcp_config/files to the controller host and recreates the kubernetes secret 6 | 7 | - hosts: controller 8 | vars_files: 9 | - "vars/gcp_config.yml" 10 | 11 | gather_facts: false 12 | roles: 13 | - gcp_config 14 | -------------------------------------------------------------------------------- /helm/ais/config/cloud/sjc11.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | aws_config: |- 2 | {{ readFile (printf "%s/.aws/sjc11/config" (env "HOME")) | indent 2 }} 3 | 4 | aws_credentials: |- 5 | {{ readFile (printf "%s/.aws/sjc11/credentials" (env "HOME")) | indent 2 }} 6 | 7 | gcp_json: |- 8 | {{ readFile (printf "%s/.gcp/sjc11/gcp.json" (env "HOME")) | indent 2 }} 9 | 10 | oci_config: |- 11 | {{ readFile (printf "%s/.oci/sjc11/config" (env "HOME")) | indent 2 }} 12 | 13 | oci_api_key: |- 14 | {{ readFile (printf "%s/.oci/sjc11/oci_api_key" (env "HOME")) | indent 2 }} -------------------------------------------------------------------------------- /monitoring/loki/helmfile.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | environments: 2 | prod: 3 | # Add defaults here if they include values used in other templates 4 | values: 5 | - ./environments/{{ .Environment.Name }}/values.yaml 6 | 7 | --- 8 | 9 | repositories: 10 | - name: grafana 11 | url: https://grafana.github.io/helm-charts 12 | 13 | releases: 14 | - name: loki 15 | namespace: monitoring 16 | createNamespace: true 17 | chart: grafana/loki 18 | values: 19 | - ./environments/{{ .Environment.Name }}/loki.yaml.gotmpl -------------------------------------------------------------------------------- /operator/config/base/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election.
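# A hedged way to observe the resulting election state at runtime (the
# namespace here is illustrative):
#   kubectl get leases -n ais-operator-system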
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | name: leader-election-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | - coordination.k8s.io 10 | resources: 11 | - configmaps 12 | - leases 13 | verbs: 14 | - get 15 | - list 16 | - watch 17 | - create 18 | - update 19 | - patch 20 | - delete 21 | - apiGroups: 22 | - "" 23 | resources: 24 | - events 25 | verbs: 26 | - create 27 | - patch 28 | -------------------------------------------------------------------------------- /helm/cluster-issuer/config/sjc11.yaml: -------------------------------------------------------------------------------- 1 | # Used for setting up a private PKI 2 | ca_cert_secret: ca-root-secret 3 | 4 | clusterIssuer: 5 | name: selfsigned-issuer 6 | 7 | caCertificate: 8 | name: selfsigned-cert 9 | duration: 8760h 10 | renewBefore: 720h 11 | privateKey: 12 | algorithm: RSA 13 | encoding: PKCS1 14 | size: 4096 15 | subject: 16 | organizations: 17 | - NVIDIA Corporation 18 | organizationalUnits: 19 | - NGC Storage 20 | countries: 21 | - US 22 | 23 | Issuer: 24 | name: ca-issuer -------------------------------------------------------------------------------- /helm/cluster-issuer/config/default.yaml: -------------------------------------------------------------------------------- 1 | # Used for setting up a private PKI 2 | ca_cert_secret: ca-root-secret 3 | 4 | clusterIssuer: 5 | name: selfsigned-issuer 6 | 7 | caCertificate: 8 | name: selfsigned-cert 9 | duration: 8760h 10 | renewBefore: 720h 11 | privateKey: 12 | algorithm: RSA 13 | encoding: PKCS1 14 | size: 4096 15 | subject: 16 | organizations: 17 | - NVIDIA Corporation 18 | organizationalUnits: 19 | - NGC Storage 20 | countries: 21 | - US 22 | 23 | Issuer: 24 | name: ca-issuer -------------------------------------------------------------------------------- /monitoring/promtail/helmfile.yaml: -------------------------------------------------------------------------------- 1 | environments: 2 | default: 3 | values: 4 | - promtail: 5 | clientUrl: {{ requiredEnv "LOKI_ENDPOINT" }} 6 | tenant: "anonymous" 7 | clusterLabel: {{ requiredEnv "LOKI_LABEL" }} 8 | 9 | --- 10 | 11 | repositories: 12 | - name: grafana 13 | url: https://grafana.github.io/helm-charts 14 | 15 | --- 16 | 17 | releases: 18 | - name: promtail 19 | namespace: monitoring 20 | createNamespace: true 21 | chart: grafana/promtail 22 | values: 23 | - values.yaml.gotmpl -------------------------------------------------------------------------------- /operator/config/base/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false -------------------------------------------------------------------------------- /tools/state-manager/pod_config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | from dataclasses import dataclass, field 5 | from typing import List, Dict 6 | 7 | 8 | @dataclass 9 | class PodConfig: 10 | name: str 11 | image: str 12 | container_name: str 13 | command: List[str] 14 | exec_cmd: str 15 | labels: Dict[str, str] = field(default_factory=dict) 16 | 17 | @property 18 | def label_selector(self) -> str: 19 | return ",".join(f"{key}={value}" for key, value in self.labels.items()) 20 | -------------------------------------------------------------------------------- /helm/authn/config/authn/nvidia.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | tls: 2 | enabled: true 3 | certPath: "/var/certs/tls.crt" 4 | keyPath: "/var/certs/tls.key" 5 | certificate: 6 | duration: 8760h # 1 year 7 | renewBefore: 720h # 30 days 8 | subject: 9 | organizations: 10 | - NVIDIA Corporation 11 | organizationalUnits: 12 | - NGC Storage 13 | countries: 14 | - US 15 | emailAddress: aistore@nvidia.com 16 | issuerRef: 17 | name: ca-issuer 18 | kind: ClusterIssuer 19 | 20 | lb: 21 | enabled: true 22 | port: 52001 -------------------------------------------------------------------------------- /monitoring/kube-prom/environments/prod/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | # See defaults: https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml 2 | 3 | kubeStateMetrics: 4 | enabled: false 5 | 6 | affinity: 7 | nodeLabelKey: monitoring 8 | nodeLabelValue: true 9 | 10 | slack: 11 | webhook: {{ requiredEnv "ALERTMANAGER_SLACK_WEBHOOK" }} 12 | channel: {{ requiredEnv "ALERTMANAGER_SLACK_CHANNEL" }} 13 | 14 | grafana: 15 | storageClass: oci-bv 16 | pvSize: 20Gi 17 | 18 | prometheus: 19 | storageClass: oci-bv 20 | pvSize: 50Gi -------------------------------------------------------------------------------- /monitoring/kube-state-metrics/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | prometheus: 2 | monitor: 3 | enabled: false 4 | 5 | {{- if hasKey .Values "affinity"}} 6 | {{- if and .Values.affinity.nodeLabelKey .Values.affinity.nodeLabelValue }} 7 | affinity: 8 | nodeAffinity: 9 | requiredDuringSchedulingIgnoredDuringExecution: 10 | nodeSelectorTerms: 11 | - matchExpressions: 12 | - key: {{ .Values.affinity.nodeLabelKey }} 13 | operator: In 14 | values: 15 | - "{{ .Values.affinity.nodeLabelValue }}" 16 | {{- end }} 17 | {{- end }} -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/generate_https_cert/templates/cert.yaml.j2: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: ais-server-cert 5 | namespace: {{ cluster }} 6 | spec: 7 | secretName: {{ tls_secret }} 8 | isCA: false 9 | duration: 8760h # 1 year 10 | renewBefore: 720h # 30 days 11 | dnsNames: 12 | {% for dns_name in dns_names %} 13 | - "{{ dns_name }}" 14 | {% endfor %} 15 | ipAddresses: 16 | {% for ip_addr in ip_addresses %} 17 | - {{ ip_addr }} 18 | {% endfor %} 19 | issuerRef: 20 | name: ca-issuer -------------------------------------------------------------------------------- /playbooks/host-config/ais_enable_multiqueue.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Enable the multiqueue schedulers in Ubuntu 18.04 - requires reboot for effect. 
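# A hedged post-reboot check (device name illustrative); the scheduler in use
# is shown in brackets:
#   cat /sys/block/sda/queue/scheduler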
3 | # 4 | --- 5 | 6 | - hosts: '{{ ais_hosts | default("ais") }}' 7 | gather_facts: false 8 | vars_prompt: 9 | - name: multiqueue_confirmation 10 | prompt: "Are you sure you want to run ais_enable_multiqueue on {{ ais_hosts }} cluster? Type 'yes' to confirm." 11 | default: "no" 12 | private: no 13 | become: true 14 | roles: 15 | - role: ais_enable_multiqueue 16 | when: multiqueue_confirmation | bool 17 | -------------------------------------------------------------------------------- /helm/cluster-issuer/config/values-sample.yaml: -------------------------------------------------------------------------------- 1 | # Used for setting up a private PKI 2 | tls_secret: tls-secret 3 | ca_cert_secret: ca-root-secret 4 | 5 | clusterIssuer: 6 | name: selfsigned-issuer 7 | 8 | caCertificate: 9 | name: selfsigned-cert 10 | duration: 8760h 11 | renewBefore: 720h 12 | privateKey: 13 | algorithm: RSA 14 | encoding: PKCS1 15 | size: 4096 16 | subject: 17 | organizations: 18 | - NVIDIA Corporation 19 | organizationalUnits: 20 | - NGC Storage 21 | countries: 22 | - US 23 | 24 | Issuer: 25 | name: ca-issuer -------------------------------------------------------------------------------- /monitoring/kube-prom/environments/dev/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | # See defaults: https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml 2 | 3 | kubeStateMetrics: 4 | enabled: false 5 | 6 | affinity: 7 | nodeLabelKey: monitoring 8 | nodeLabelValue: true 9 | 10 | slack: 11 | webhook: {{ env "ALERTMANAGER_SLACK_WEBHOOK" | default "" }} 12 | channel: {{ env "ALERTMANAGER_SLACK_CHANNEL" | default "" }} 13 | 14 | grafana: 15 | storageClass: local-path 16 | pvSize: 20Gi 17 | 18 | prometheus: 19 | storageClass: local-path 20 | pvSize: 20Gi -------------------------------------------------------------------------------- /operator/config/base/webhook/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # the following config is for teaching kustomize where to look at when substituting vars. 2 | # It requires kustomize v2.1.0 or newer to work properly. 3 | nameReference: 4 | - kind: Service 5 | version: v1 6 | fieldSpecs: 7 | - kind: ValidatingWebhookConfiguration 8 | group: admissionregistration.k8s.io 9 | path: webhooks/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: ValidatingWebhookConfiguration 13 | group: admissionregistration.k8s.io 14 | path: webhooks/clientConfig/service/namespace 15 | create: true -------------------------------------------------------------------------------- /operator/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OPERATOR_DIR="$(cd "$(dirname "$0")/../"; pwd -P)" 4 | # This script is used by Makefile to run commands. 5 | source ${OPERATOR_DIR}/scripts/utils.sh 6 | 7 | case $1 in 8 | fmt) 9 | case $2 in 10 | --fix) 11 | echo "Running style fixing..." >&2 12 | 13 | gofmt -s -w ${OPERATOR_DIR} 14 | ;; 15 | *) 16 | echo "Running style check..." 
>&2 17 | 18 | check_gomod 19 | check_imports 20 | check_files_headers 21 | ;; 22 | esac 23 | ;; 24 | 25 | *) 26 | echo "unsupported argument $1" 27 | exit 1 28 | ;; 29 | esac 30 | -------------------------------------------------------------------------------- /playbooks/security/roles/rsyslog/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Configure rsyslog 3 | # The rsyslog software package may be used instead of the default journald logging mechanism. 4 | - name: Ensure rsyslog configuration directory exists 5 | file: 6 | path: /etc/rsyslog.d 7 | state: directory 8 | 9 | - name: Ensure rsyslog default file permissions are configured 10 | lineinfile: 11 | path: /etc/rsyslog.d/99-default-permissions.conf 12 | line: '\$FileCreateMode 0640' 13 | create: yes 14 | 15 | - name: Restart rsyslog service 16 | service: 17 | name: rsyslog 18 | state: restarted 19 | -------------------------------------------------------------------------------- /operator/.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/ 8 | Dockerfile.cross 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Go workspace file 17 | go.work 18 | 19 | # Kubernetes Generated files - skip generated files, except for vendored files 20 | !vendor/**/zz_generated.* 21 | 22 | # editor and IDE paraphernalia 23 | .idea 24 | .vscode 25 | *.swp 26 | *.swo 27 | *~ 28 | 29 | # Generated for local test runs 30 | tests/test_job_spec.yaml 31 | -------------------------------------------------------------------------------- /operator/config/overlays/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | args: 12 | - "--config=controller_manager_config.yaml" 13 | volumeMounts: 14 | - name: manager-config 15 | mountPath: /controller_manager_config.yaml 16 | subPath: controller_manager_config.yaml 17 | volumes: 18 | - name: manager-config 19 | configMap: 20 | name: manager-config 21 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/ais_deploy_operator/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Operator version 2 | debug: 3 | msg: Deploying AIS Operator {{ operator_version }} 4 | 5 | - name: Copy operator deploy script 6 | become: true 7 | template: 8 | src: "deploy-operator.sh.j2" 9 | dest: "/tmp/deploy-operator.sh" 10 | mode: "0755" 11 | 12 | - name: Run deploy operator scripts 13 | shell: "/tmp/deploy-operator.sh" 14 | 15 | - name: Wait for operator to be in Running state 16 | shell: "kubectl wait --for=condition=available deployment/ais-operator-controller-manager -n ais-operator-system --timeout=5m" 17 | -------------------------------------------------------------------------------- /tools/state-manager/ais_metadata.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | from enum import Enum 5 | from typing import List 6 | 7 | 8 | class AISMetadata(Enum): 9 | smap = ".ais.smap" 10 | conf = ".ais.conf" 11 | bmd = ".ais.bmd" 12 | rmd = ".ais.rmd" 13 | override = ".ais.override_config" 14 | all = ".ais.*" 15 | 16 | @staticmethod 17 | def get_options() -> List[str]: 18 | return [e.name for e in AISMetadata] 19 | 20 | @staticmethod 21 | def get_options_str() -> str: 22 | return ",".join(AISMetadata.get_options()) 23 | -------------------------------------------------------------------------------- /operator/PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: nvidia.com 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: ais-operator 9 | repo: github.com/ais-operator 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | controller: true 15 | domain: nvidia.com 16 | group: ais 17 | kind: AIStore 18 | path: github.com/ais-operator/api/v1beta1 19 | version: v1beta1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /helm/ais-client/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: {{ .Values.serviceAccount.name }}-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | verbs: ["get", "list", "create"] 9 | --- 10 | apiVersion: rbac.authorization.k8s.io/v1 11 | kind: RoleBinding 12 | metadata: 13 | name: {{ .Values.serviceAccount.name }}-rolebinding 14 | subjects: 15 | - kind: ServiceAccount 16 | name: {{ .Values.serviceAccount.name }} 17 | roleRef: 18 | kind: Role 19 | name: {{ .Values.serviceAccount.name }}-role 20 | apiGroup: rbac.authorization.k8s.io 21 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/fetch_ca_cert.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Fetch a certificate from the K8s secret for use with client verification 3 | # 4 | --- 5 | - name: Fetch certificate 6 | hosts: controller 7 | gather_facts: false 8 | vars_files: 9 | - vars/https_config.yml 10 | vars: 11 | - cacert_file: ais_ca.crt 12 | pre_tasks: 13 | - name: Validate if cluster is defined 14 | fail: 15 | msg: "Variable 'cluster' not found. Add the 'cluster' variable during execution. e.g. ansible-playbook -i hosts.ini fetch_ca_cert.yml -e cluster=ais" 16 | when: cluster is undefined 17 | roles: 18 | - fetch_ca_cert 19 | -------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_taint_nodes/tasks/main.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Apply node taints. Run for every host in the play, executes action on local host. 
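# cpu_node_taint is expected in kubectl's key=value:Effect form; a purely
# hypothetical example value:
#   cpu_node_taint: "aisnode=cpu:NoSchedule"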
3 | # 4 | 5 | - name: Check whether storage node taint is applied 6 | local_action: 7 | module: shell 8 | _raw_params: kubectl describe node {{ inventory_hostname }} | grep --before=0 --after=10 '^Taints:' | grep '{{ cpu_node_taint }}' 9 | ignore_errors: True 10 | register: taintgrep 11 | 12 | - name: Apply taint to node 13 | local_action: 14 | module: command 15 | _raw_params: kubectl taint nodes {{ inventory_hostname }} {{ cpu_node_taint }} 16 | when: taintgrep is failed -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/prod/metrics.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | prometheus.relabel "redirect" { 2 | forward_to = [prometheus.relabel.all_metrics.receiver] 3 | } 4 | 5 | prometheus.relabel "node_exporter" { 6 | forward_to = [prometheus.relabel.all_metrics.receiver] 7 | rule { 8 | action = "replace" 9 | target_label = "job" 10 | replacement = "node-exporter" 11 | } 12 | } 13 | 14 | prometheus.scrape "kube_state_metrics" { 15 | targets = discovery.relabel.kube_state_metrics.output 16 | job_name = "kube-state-metrics" 17 | scrape_interval = "30s" 18 | forward_to = [prometheus.relabel.all_metrics.receiver] 19 | } 20 | -------------------------------------------------------------------------------- /operator/config/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - crd/ 3 | - rbac/ 4 | - manager/ 5 | # [METRICS] Expose the controller manager metrics service. 6 | - metrics_service.yaml 7 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 8 | # crd/kustomization.yaml 9 | - webhook/ 10 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 11 | # Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks.
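# (cert-manager is what issues the serving certificates for the [WEBHOOK] admission endpoints above, which is why the webhook/ and certmanager/ resources are enabled together here.)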
12 | # 'CERTMANAGER' needs to be enabled to use ca injection 13 | - certmanager/ 14 | -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/remote/metrics.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | prometheus.relabel "redirect" { 2 | forward_to = [prometheus.relabel.remote_metrics.receiver] 3 | } 4 | 5 | prometheus.relabel "node_exporter" { 6 | forward_to = [prometheus.relabel.remote_metrics.receiver] 7 | rule { 8 | action = "replace" 9 | target_label = "job" 10 | replacement = "node-exporter" 11 | } 12 | } 13 | 14 | prometheus.scrape "kube_state_metrics" { 15 | targets = discovery.relabel.kube_state_metrics.output 16 | job_name = "kube-state-metrics" 17 | scrape_interval = "30s" 18 | forward_to = [prometheus.relabel.remote_metrics.receiver] 19 | } -------------------------------------------------------------------------------- /monitoring/kube-state-metrics/helmfile.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | repositories: 2 | - name: prometheus-community 3 | url: https://prometheus-community.github.io/helm-charts 4 | 5 | environments: 6 | prod: 7 | values: 8 | - ./environments/{{ .Environment.Name }}/values.yaml 9 | dev: 10 | values: 11 | - ./environments/{{ .Environment.Name }}/values.yaml 12 | 13 | --- 14 | 15 | releases: 16 | - name: kube-state-metrics 17 | namespace: monitoring 18 | createNamespace: true 19 | chart: prometheus-community/kube-state-metrics 20 | values: 21 | - ./values.yaml.gotmpl 22 | - ./environments/{{ .Environment.Name }}/values.yaml -------------------------------------------------------------------------------- /auth/keycloak/manifests/certificate.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: cert-manager.io/v1 2 | kind: Certificate 3 | metadata: 4 | name: keycloak-cert 5 | namespace: keycloak 6 | spec: 7 | dnsNames: 8 | - localhost 9 | - "*.keycloak.svc.cluster.local" 10 | duration: 8760h 11 | emailAddresses: 12 | - aistore@nvidia.com 13 | ipAddresses: 14 | isCA: false 15 | issuerRef: 16 | kind: ClusterIssuer 17 | name: ca-issuer 18 | renewBefore: 720h 19 | secretName: keycloak-tls 20 | subject: 21 | countries: 22 | - US 23 | organizationalUnits: 24 | - NGC Storage 25 | organizations: 26 | - NVIDIA Corporation 27 | usages: 28 | - server auth -------------------------------------------------------------------------------- /manifests/cloud/oci-authn-lb.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # oci-authn-lb.yaml 3 | # This YAML file defines a Kubernetes Service that configures a network load balancer (NLB) 4 | # for the AIStore AuthN components in Oracle Cloud Infrastructure (OCI). 5 | apiVersion: v1 6 | kind: Service 7 | metadata: 8 | name: ais-authn-lb 9 | namespace: ais 10 | annotations: 11 | oci.oraclecloud.com/load-balancer-type: "nlb" 12 | oci-network-load-balancer.oraclecloud.com/internal: "true" 13 | labels: 14 | app: authn 15 | spec: 16 | selector: 17 | app: authn 18 | ports: 19 | - protocol: TCP 20 | port: 52001 21 | targetPort: 52001 22 | type: LoadBalancer 23 | -------------------------------------------------------------------------------- /playbooks/host-config/roles/pcm/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Check PCM directory exists . 
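# PCM is Intel's Processor Counter Monitor; the stat check below keeps the download/unpack/build steps idempotent across re-runs.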
4 | stat: 5 | path: ~/pcm-master 6 | register: pcm_dir_exists 7 | 8 | - name: Get PCM code as zip 9 | get_url: 10 | url: 'https://github.com/opcm/pcm/archive/master.zip' 11 | dest: '/tmp/pcm.zip' 12 | mode: 0440 13 | when: not pcm_dir_exists.stat.exists 14 | 15 | - name: Unarchive pcm.zip 16 | unarchive: 17 | src: /tmp/pcm.zip 18 | dest: ~/ 19 | remote_src: yes 20 | when: not pcm_dir_exists.stat.exists 21 | 22 | - name: Install PCM tool 23 | shell: cmake . && cmake --build . 24 | args: 25 | chdir: ~/pcm-master 26 | -------------------------------------------------------------------------------- /operator/config/overlays/prometheus/monitor_tls_patch.yaml: -------------------------------------------------------------------------------- 1 | # Patch for Prometheus ServiceMonitor to enable secure TLS configuration 2 | # using certificates managed by cert-manager 3 | - op: replace 4 | path: /spec/endpoints/0/tlsConfig 5 | value: 6 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 7 | serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc 8 | insecureSkipVerify: false 9 | ca: 10 | secret: 11 | name: metrics-server-cert 12 | key: ca.crt 13 | cert: 14 | secret: 15 | name: metrics-server-cert 16 | key: tls.crt 17 | keySecret: 18 | name: metrics-server-cert 19 | key: tls.key -------------------------------------------------------------------------------- /helm/operator/config/tls-cert/sjc4-1000.yaml: -------------------------------------------------------------------------------- 1 | spec: 2 | tlsSecret: operator-tls 3 | duration: 8760h # 1 year 4 | renewBefore: 720h # 30 days 5 | subject: 6 | organizations: 7 | - NVIDIA Corporation 8 | organizationalUnits: 9 | - NGC Storage 10 | countries: 11 | - US 12 | dnsNames: 13 | - "localhost" 14 | - "aistorecpu1000.sjc4.maas.cis" 15 | - "aistorecpu1001.sjc4.maas.cis" 16 | - "aistorecpu1002.sjc4.maas.cis" 17 | ipAddresses: 18 | - 127.0.0.1 19 | - 10.150.56.248 20 | - 10.150.56.245 21 | - 10.150.56.246 22 | emailAddress: ais@exchange.nvidia.com 23 | issuerRef: 24 | name: ca-issuer 25 | kind: ClusterIssuer -------------------------------------------------------------------------------- /playbooks/hosts-example.ini: -------------------------------------------------------------------------------- 1 | # The k8s node with kubectl 2 | # ansible_host is unnecessary if the name of your host is the IP 3 | # additional_hosts is used for multi-home deployments if the host has multiple IPs configured 4 | [controller] 5 | controller_host ansible_host=10.51.248.1 additional_hosts=10.51.248.32 6 | 7 | # GPU nodes 8 | [gpu-nodes] 9 | gpu-worker1 ansible_host=10.51.248.4 additional_hosts=10.51.248.35 10 | 11 | # Other k8s nodes 12 | [ais] 13 | worker1 ansible_host=10.51.248.2 additional_hosts=10.51.248.33 14 | worker2 ansible_host=10.51.248.3 additional_hosts=10.51.248.34 15 | 16 | # Add other nodes to the ais group 17 | [ais:children] 18 | controller 19 | gpu-nodes 20 | -------------------------------------------------------------------------------- /operator/config/base/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - '*' 9 | resources: 10 | - '*' 11 | verbs: 12 | - '*' 13 | - apiGroups: 14 | - ais.nvidia.com 15 | resources: 16 | - aistores 17 | verbs: 18 | - create 19 | - delete 20 | - get 21 | - list 22 | - patch 23 | - update 24 | - watch 25 | - apiGroups: 26 | - 
ais.nvidia.com 27 | resources: 28 | - aistores/finalizers 29 | verbs: 30 | - update 31 | - apiGroups: 32 | - ais.nvidia.com 33 | resources: 34 | - aistores/status 35 | verbs: 36 | - get 37 | - patch 38 | - update 39 | -------------------------------------------------------------------------------- /operator/config/base/webhook/manifests.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: admissionregistration.k8s.io/v1 3 | kind: ValidatingWebhookConfiguration 4 | metadata: 5 | name: validating-webhook-configuration 6 | webhooks: 7 | - admissionReviewVersions: 8 | - v1 9 | - v1beta1 10 | clientConfig: 11 | service: 12 | name: webhook-service 13 | namespace: system 14 | path: /validate-ais-nvidia-com-v1beta1-aistore 15 | failurePolicy: Fail 16 | name: vaistore.kb.io 17 | rules: 18 | - apiGroups: 19 | - ais.nvidia.com 20 | apiVersions: 21 | - v1beta1 22 | operations: 23 | - CREATE 24 | - UPDATE 25 | resources: 26 | - aistores 27 | sideEffects: None 28 | -------------------------------------------------------------------------------- /operator/pkg/services/services_suite_test.go: -------------------------------------------------------------------------------- 1 | // Package services contains services for the operator to use when reconciling AIS 2 | /* 3 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package services 6 | 7 | import ( 8 | "testing" 9 | 10 | . "github.com/onsi/ginkgo/v2" 11 | . "github.com/onsi/gomega" 12 | logf "sigs.k8s.io/controller-runtime/pkg/log" 13 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 14 | ) 15 | 16 | func TestServices(t *testing.T) { 17 | RegisterFailHandler(Fail) 18 | 19 | RunSpecs(t, "Services Suite") 20 | } 21 | 22 | var _ = BeforeSuite(func() { 23 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 24 | }) 25 | -------------------------------------------------------------------------------- /playbooks/host-config/roles/config_kubelet_systemd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | 2 | - name: Confirm systemd drop-in overwrite 3 | ansible.builtin.pause: 4 | prompt: "This playbook will overwrite the kubelet extra args configuration if it is running as a systemd service.
5 | Make sure the conf file in roles/config_kubelet_systemd/files has all of the extra args your kubelet service needs. 6 | Type 'yes' to continue" 7 | register: user_confirm 8 | 9 | - name: Copy kubelet-extra-args config drop-in file for the kubelet service 10 | ansible.builtin.copy: 11 | src: "kubelet-extra-args.conf" 12 | dest: "{{ kubelet_systemd_path }}/kubelet-extra-args.conf" 13 | mode: "0644" 14 | when: user_confirm.user_input | default('') == 'yes' 15 | 16 | -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/environments/local/metrics.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | prometheus.relabel "redirect" { 2 | forward_to = [otelcol.receiver.prometheus.local_receiver_import.receiver] 3 | } 4 | 5 | prometheus.relabel "node_exporter" { 6 | forward_to = [otelcol.receiver.prometheus.local_receiver_import.receiver] 7 | rule { 8 | action = "replace" 9 | target_label = "job" 10 | replacement = "node-exporter" 11 | } 12 | } 13 | 14 | prometheus.scrape "kube_state_metrics" { 15 | targets = discovery.relabel.kube_state_metrics.output 16 | job_name = "kube-state-metrics" 17 | scrape_interval = "30s" 18 | forward_to = [otelcol.receiver.prometheus.local_receiver_import.receiver] 19 | } 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-issue.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: "📕 Documentation Issue" 3 | description: Report a documentation issue in the Operator, Helm, Playbooks, or elsewhere 4 | title: "(short issue description)" 5 | labels: [documentation, needs-triage] 6 | body: 7 | - type: textarea 8 | id: description 9 | attributes: 10 | label: Describe the issue 11 | description: A clear and concise description of the issue. 12 | validations: 13 | required: true 14 | 15 | - type: textarea 16 | id: links 17 | attributes: 18 | label: Links 19 | description: | 20 | Include links to affected documentation page(s).
21 | validations: 22 | required: true -------------------------------------------------------------------------------- /helm/operator/config/tls-cert/sjc11.yaml: -------------------------------------------------------------------------------- 1 | spec: 2 | tlsSecret: operator-tls 3 | duration: 8760h # 1 year 4 | renewBefore: 720h # 30 days 5 | subject: 6 | organizations: 7 | - NVIDIA Corporation 8 | organizationalUnits: 9 | - NGC Storage 10 | countries: 11 | - US 12 | dnsNames: 13 | - "localhost" 14 | - "aistorecpu1004.nsv.sjc11.nvmetal.net" 15 | - "aistorecpu1005.nsv.sjc11.nvmetal.net" 16 | - "aistorecpu1006.nsv.sjc11.nvmetal.net" 17 | ipAddresses: 18 | - 127.0.0.1 19 | - 10.52.160.21 20 | - 10.52.160.20 21 | - 10.52.160.87 22 | emailAddress: ais@exchange.nvidia.com 23 | issuerRef: 24 | name: ca-issuer 25 | kind: ClusterIssuer -------------------------------------------------------------------------------- /helm/operator/config/tls-cert/sjc4-dev.yaml: -------------------------------------------------------------------------------- 1 | spec: 2 | tlsSecret: operator-tls 3 | duration: 8760h # 1 year 4 | renewBefore: 720h # 30 days 5 | subject: 6 | organizations: 7 | - NVIDIA Corporation 8 | organizationalUnits: 9 | - NGC Storage 10 | countries: 11 | - US 12 | dnsNames: 13 | - "localhost" 14 | - "aistorecpu1010.nsv.sjc4.nvmetal.net" 15 | - "aistorecpu1011.nsv.sjc4.nvmetal.net" 16 | - "aistorecpu1012.nsv.sjc4.nvmetal.net" 17 | ipAddresses: 18 | - 127.0.0.1 19 | - 10.150.56.227 20 | - 10.150.56.230 21 | - 10.150.56.225 22 | emailAddress: aistore@nvidia.com 23 | issuerRef: 24 | name: ca-issuer 25 | kind: ClusterIssuer -------------------------------------------------------------------------------- /operator/config/base/certmanager/certificate_webhook.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml 6 | namespace: system 7 | spec: 8 | # $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize 9 | # FIXME: Add configurable cluster domain (cluster.local). 10 | dnsNames: 11 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc 12 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local 13 | issuerRef: 14 | kind: Issuer 15 | name: selfsigned-issuer 16 | secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize 17 | -------------------------------------------------------------------------------- /playbooks/host-config/ais_host_config_common.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Apply our desired config to the 'ais' host group 3 | # 4 | # This config is intended to apply to *all* hosts; another playbook 5 | # addresses specific needs for GPU hosts. 6 | # 7 | --- 8 | - hosts: '{{ ais_hosts | default("ais") }}' 9 | vars_files: 10 | - "vars/host_config.yml" 11 | vars_prompt: 12 | - name: host_config_confirmation 13 | prompt: "Are you sure you want to run ais_host_config_common on {{ ais_hosts }} cluster? Type 'yes' to confirm." 
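# Example invocation (inventory file name hypothetical): ansible-playbook -i hosts.ini ais_host_config_common.yml -e ais_hosts=ais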
14 | default: "no" 15 | private: no 16 | become: true 17 | gather_facts: false 18 | roles: 19 | - role: ais_host_config_common 20 | when: host_config_confirmation | bool -------------------------------------------------------------------------------- /ais-operator-helper/README.md: -------------------------------------------------------------------------------- 1 | # AIS Operator Helper Docker Image 2 | 3 | The `ais-operator-helper` Docker image contains essential utilities for the AIS Operator. These tools assist in various operational tasks to maintain and manage the AIS cluster efficiently. 4 | 5 | | Executable Name | Description | 6 | |-----------------|-------------| 7 | | [`cleanup-helper`](src/cleanup-helper.go) | The `cleanup-helper` is designed to perform cleanup operations across all nodes within an AIS cluster. It deletes all files matching the `.ais.*` pattern within a specified directory.
**Usage:**
`/cleanup-helper -dir=/etc/ais`
Running this command in the Docker image deletes all files matching the pattern in the `/etc/ais` directory. | 8 | -------------------------------------------------------------------------------- /playbooks/host-config/roles/check_disk_info/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Gather disk, mount path info, and filesystem type 2 | ansible.builtin.shell: | 3 | lsblk -f | grep {{ item }} | awk '{print $1, $2, $6}' 4 | register: disk_info 5 | loop: "{{ ais_devices }}" 6 | ignore_errors: yes 7 | when: disk_info_confirmation == "yes" 8 | 9 | - name: Show disk, mount path info, and filesystem type 10 | debug: 11 | msg: "Host: {{ inventory_hostname }}, Disk: {{ item.item }}, FS Type: {{ item.stdout.split(' ')[1] }}, Mounted On: {{ item.stdout.split(' ')[2] }}" 12 | loop: "{{ disk_info.results | default([]) }}" 13 | when: 14 | - disk_info_confirmation == "yes" 15 | - "'results' in disk_info" 16 | - item.rc == 0 -------------------------------------------------------------------------------- /playbooks/host-config/roles/config_kubelet/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Read the YAML file 2 | ansible.builtin.slurp: 3 | path: "{{ kubelet_var_path }}" 4 | register: kubelet_conf 5 | 6 | - name: Convert YAML content to a dictionary 7 | ansible.builtin.set_fact: 8 | yaml_content: "{{ kubelet_conf['content'] | b64decode | from_yaml }}" 9 | 10 | - name: Add or update the 'allowedUnsafeSysctls' entry 11 | ansible.builtin.set_fact: 12 | yaml_content: "{{ yaml_content | combine({'allowedUnsafeSysctls': unsafe_sysctls }, recursive=True) }}" 13 | 14 | - name: Write the modified YAML back to the file 15 | ansible.builtin.copy: 16 | content: "{{ yaml_content | to_yaml }}" 17 | dest: "{{ kubelet_var_path }}" -------------------------------------------------------------------------------- /helm/ais/charts/ais-cluster/templates/proxy-lb.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.proxyLB.enabled }} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | annotations: 6 | {{- with .Values.proxyLB.annotations }} 7 | {{- toYaml .
| nindent 4 }} 8 | {{- end }} 9 | labels: 10 | app.kubernetes.io/name: {{ .Values.cluster }} 11 | name: {{ .Values.cluster }}-proxy-lb 12 | namespace: {{ .Release.Namespace }} 13 | spec: 14 | ports: 15 | - name: pub 16 | port: {{ .Values.proxyLB.port }} 17 | protocol: TCP 18 | targetPort: {{ .Values.proxySpec.portPublic }} 19 | selector: 20 | app.kubernetes.io/name: {{ .Values.cluster }} 21 | app.kubernetes.io/component: proxy 22 | type: LoadBalancer 23 | {{- end }} -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/generate_https_cert/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Copy CA Creation yaml 2 | become: true 3 | template: 4 | src: "ca.yaml.j2" 5 | dest: "/tmp/https_ca.yaml" 6 | mode: 0777 7 | lstrip_blocks: true 8 | 9 | - name: Create CA 10 | command: "kubectl apply -f /tmp/https_ca.yaml" 11 | 12 | - name: Delete existing cert if it exists 13 | shell: kubectl delete secret {{ tls_secret }} -n ais --ignore-not-found 14 | 15 | - name: Use CA to issue cert 16 | become: true 17 | template: 18 | src: "cert.yaml.j2" 19 | dest: "/tmp/https_cert.yaml" 20 | mode: 0777 21 | lstrip_blocks: true 22 | 23 | - name: Create Cert 24 | command: "kubectl apply -f /tmp/https_cert.yaml" -------------------------------------------------------------------------------- /helm/authn/config/authn/oci.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | tls: 2 | enabled: true 3 | createCert: true 4 | certPath: "/var/certs/tls.crt" 5 | keyPath: "/var/certs/tls.key" 6 | certificate: 7 | duration: 8760h # 1 year 8 | renewBefore: 720h # 30 days 9 | subject: 10 | organizations: 11 | - NVIDIA Corporation 12 | organizationalUnits: 13 | - NGC Storage 14 | countries: 15 | - US 16 | emailAddress: ais@exchange.nvidia.com 17 | issuerRef: 18 | name: ca-issuer 19 | kind: ClusterIssuer 20 | 21 | lb: 22 | enabled: true 23 | port: 52001 24 | annotations: 25 | oci.oraclecloud.com/load-balancer-type: "nlb" 26 | oci-network-load-balancer.oraclecloud.com/internal: "true" -------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_host_config_common/templates/01-netcfg.yaml.j2: -------------------------------------------------------------------------------- 1 | # 2 | # Ubuntu 18.04 has some bug whereby specifying the mtu for a dhcp-configured 3 | # interface in netplan fails, so we have to resort to some matching as below. 4 | # 5 | # XXX The item.interface below doesn't seem to restrict us to matching 6 | # only that link. That's ok since our cpu/storage nodes do have differently 7 | # named ethernet links to the DGX nodes, but could affect more interfaces 8 | # than we want in some configs. 
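# For reference, each `item` is expected to provide the keys used below, e.g. (hypothetical values): {interface: enp94s0f0, driver: mlx5_core, mtu: 9000}.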
9 | # 10 | network: 11 | version: 2 12 | renderer: networkd 13 | ethernets: 14 | {{ item.interface }}: 15 | dhcp4: yes 16 | match: 17 | driver: {{ item.driver }} 18 | mtu: {{ item.mtu }} 19 | 20 | -------------------------------------------------------------------------------- /.github/workflows/docker_ais_logs.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image -- AIS Logs 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | image_tag: 7 | description: 'ais-logs image tag' 8 | required: true 9 | default: 'latest' 10 | jobs: 11 | docker: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v6 15 | - name: Login to DockerHub 16 | uses: docker/login-action@v3 17 | with: 18 | username: ${{ secrets.DOCKERHUB_USERNAME }} 19 | password: ${{ secrets.DOCKERHUB_TOKEN }} 20 | 21 | - name: Build and Push 'aistorage/ais-logs' 22 | run: | 23 | pushd $GITHUB_WORKSPACE/log-sidecar 24 | TAG="${{ inputs.image_tag }}" make all 25 | popd 26 | -------------------------------------------------------------------------------- /helm/ais/config/ais/sjc4-1000.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 5.8Ti 5 | mounts: 6 | - path: "/ais/nvme0n1" 7 | - path: "/ais/nvme1n1" 8 | - path: "/ais/nvme2n1" 9 | size: 3 10 | protocol: https 11 | https: 12 | skipVerifyCert: false 13 | tlsSecret: "tls-certs" 14 | nodeImage: 15 | name: aistorage/aisnode 16 | tag: v4.0 17 | initImage: 18 | name: aistorage/ais-init 19 | tag: v4.0 20 | logSidecarImage: 21 | name: aistorage/ais-logs 22 | tag: v1.1 23 | configToUpdate: 24 | memsys: 25 | hk_time: 3m 26 | backend: 27 | aws: {} 28 | net: 29 | http: 30 | idle_conn_time: 20s 31 | idle_conns: 2048 32 | idle_conns_per_host: 128 33 | stateStorageClass: "local-path" -------------------------------------------------------------------------------- /operator/config/base/certmanager/certificate_metrics.yaml: -------------------------------------------------------------------------------- 1 | # The following manifests contain a self-signed issuer CR and a metrics certificate CR. 2 | # More document can be found at https://docs.cert-manager.io 3 | apiVersion: cert-manager.io/v1 4 | kind: Certificate 5 | metadata: 6 | name: metrics-certs # this name should match the one appeared in kustomizeconfig.yaml 7 | namespace: system 8 | spec: 9 | dnsNames: 10 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 11 | # replacements in the config/default/kustomization.yaml file. 12 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc 13 | - $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local 14 | issuerRef: 15 | kind: Issuer 16 | name: selfsigned-issuer 17 | secretName: metrics-server-cert -------------------------------------------------------------------------------- /playbooks/host-config/vars/ais_datafs.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Vars for playbooks ais_datafs_* 3 | # 4 | 5 | # 6 | # It is recommended that you do *not* include ais_hosts here - it becomes too 7 | # easy to mkfs all filesystems by mistake (eg when extending a cluster). 8 | # 9 | # Example, defining ais_hosts at playbook run time: 10 | # ansible-playbook -i hosts.ini ais_datafs_mkfs.yml -e ais_hosts=cpu-worker-node --become --check 11 | 12 | 13 | # 14 | # Devices under /dev on which we will build XFS filesystems. 
Defaults must 15 | # be replaced or over-ridden with -e 16 | # 17 | # Examples on an HDD system: sda, sdb, sdc, ... 18 | # 19 | #ais_devices: 20 | # - sda 21 | # - sdb 22 | # - sdc 23 | # - sdd 24 | # - sde 25 | # - sdf 26 | # - sdg 27 | # - sdh 28 | # - sdi 29 | # - sdj 30 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | # and 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "docker" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | - package-ecosystem: "pip" 17 | directory: "/" 18 | schedule: 19 | interval: "weekly" 20 | - package-ecosystem: "gomod" 21 | directory: "/" 22 | schedule: 23 | interval: "weekly" -------------------------------------------------------------------------------- /monitoring/kube-prom/values/kube-state-metrics.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | kubeStateMetrics: 2 | enabled: {{ .Values.kubeStateMetrics.enabled | default false }} 3 | 4 | {{- if .Values.kubeStateMetrics.enabled }} 5 | kube-state-metrics: 6 | {{- if hasKey .Values "affinity"}} 7 | {{- if and .Values.affinity.nodeLabelKey .Values.affinity.nodeLabelValue }} 8 | affinity: 9 | nodeAffinity: 10 | requiredDuringSchedulingIgnoredDuringExecution: 11 | nodeSelectorTerms: 12 | - matchExpressions: 13 | - key: {{ .Values.affinity.nodeLabelKey }} 14 | operator: In 15 | values: 16 | - "{{ .Values.affinity.nodeLabelValue }}" 17 | {{- end }} 18 | {{- end }} 19 | prometheus: 20 | monitor: 21 | enabled: false 22 | {{- end }} -------------------------------------------------------------------------------- /playbooks/ais-deployment/create_network_definition.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "controller" 3 | gather_facts: false 4 | vars_files: 5 | - "vars/multihome.yml" 6 | vars: 7 | - k8s_namespace: "{{ attachment_namespace }}" 8 | 9 | pre_tasks: 10 | - name: Check network_attachment variable 11 | fail: 12 | msg: "`network_attachment` name must be provided!" 13 | when: network_attachment is undefined or network_attachment | length == 0 14 | 15 | - name: Check network_interface variable 16 | fail: 17 | msg: "`network_interface` name must be provided!" 
18 | when: network_interface is undefined or network_interface | length == 0 19 | 20 | roles: 21 | - install_multus 22 | - create_namespace 23 | - create_network_definition 24 | -------------------------------------------------------------------------------- /helm/ais/scripts/label-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [ "$#" -lt 2 ]; then 5 | echo "Usage: $0 <cluster> <nodes|--all>" 6 | exit 1 7 | fi 8 | 9 | CLUSTER="$1" 10 | NODE_ARG="$2" 11 | 12 | if [[ "$NODE_ARG" == "--all" ]]; then 13 | IFS=' ' read -ra NODES <<< "$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[*].metadata.name}')" 14 | else 15 | IFS=',' read -ra NODES <<< "$NODE_ARG" 16 | fi 17 | 18 | if [[ ${#NODES[@]} -eq 0 ]]; then 19 | echo "Error: No nodes found" 20 | exit 1 21 | fi 22 | 23 | echo "Labeling ${#NODES[@]} nodes for cluster '$CLUSTER'" 24 | 25 | for NODE in "${NODES[@]}"; do 26 | kubectl label node "$NODE" "nvidia.com/ais-proxy=$CLUSTER" "nvidia.com/ais-target=$CLUSTER" --overwrite 27 | done 28 | -------------------------------------------------------------------------------- /helm/operator/check_cert_manager.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | READY_COUNT=$(kubectl get pods -A --field-selector=status.phase=Running | grep "cert-manager" | grep -E "[0-9]/[0-9].*Running" | wc -l) 4 | 5 | if [ "$READY_COUNT" -ge 3 ]; then 6 | echo "All cert-manager pods are ready" 7 | echo "Continuing operator installation" 8 | exit 0 9 | fi 10 | 11 | echo "Not all cert-manager pods are ready. Found $READY_COUNT ready pods" 12 | echo "The AIS K8s operator requires cert-manager." 13 | echo "Run 14 | • \`kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.1/cert-manager.yaml\` 15 | OR 16 | • Install the cert-manager helm chart https://artifacthub.io/packages/helm/cert-manager/cert-manager 17 | Then re-run the AIS K8s operator helm chart installation." 18 | exit 1 -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/install_controller_requirements/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure Python 3 and pip are installed 2 | ansible.builtin.package: 3 | name: 4 | - python3 5 | - python3-pip 6 | state: present 7 | 8 | - name: Upgrade pip to the latest version 9 | ansible.builtin.pip: 10 | name: pip 11 | state: latest 12 | executable: pip3 13 | 14 | - name: Read local requirements.txt 15 | set_fact: 16 | requirements_list: "{{ lookup('file', role_path + '/files/requirements.txt').splitlines() }}" 17 | delegate_to: localhost 18 | 19 | - name: Install Python packages from requirements file 20 | ansible.builtin.pip: 21 | name: "{{ requirements_list }}" 22 | state: present 23 | executable: pip3 24 | extra_args: --ignore-installed 25 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_cleanup_all.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "{{ cluster }}" 3 | become: true 4 | vars_files: 5 | - "vars/ais_mpaths.yml" 6 | 7 | vars_prompt: 8 | - name: "cleanup_confirmation" 9 | prompt: "Are you sure you would like to delete all AIS-related (meta-)data on {{ cluster }} hosts and mountpaths {{ ais_mpaths }}? Type 'yes' to confirm."
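# Example invocation (inventory/cluster names hypothetical): ansible-playbook -i hosts.ini ais_cleanup_all.yml -e cluster=ais; mountpaths come from vars/ais_mpaths.yml or can be overridden with -e ais_mpaths=[...]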
10 | default: "no" 11 | private: no 12 | 13 | pre_tasks: 14 | - name: check confirmation 15 | fail: 16 | msg: "cleanup not confirmed/forced" 17 | when: cleanup_confirmation != "yes" 18 | 19 | - name: check mountpath list 20 | fail: 21 | msg: "no ais_mpaths specified!" 22 | when: ais_mpaths is undefined 23 | 24 | gather_facts: no 25 | roles: 26 | - ais_cleanup_all 27 | -------------------------------------------------------------------------------- /operator/config/overlays/default/manager_ca_configmap_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | volumeMounts: 12 | - name: ais-ca 13 | mountPath: /etc/ais/ca 14 | readOnly: true 15 | volumes: 16 | - name: ais-ca 17 | configMap: 18 | # Placeholder replaced during Helm chart build with {{ .Values.controllerManager.manager.aisCAConfigmapName }} 19 | # Default value: ais-operator-ais-ca 20 | # Can be overridden in Helm values to use trust-manager bundles 21 | name: AIS_CA_CONFIGMAP_PLACEHOLDER 22 | optional: true -------------------------------------------------------------------------------- /operator/pkg/controllers/events.go: -------------------------------------------------------------------------------- 1 | // Package controllers contains k8s controller logic for AIS cluster 2 | /* 3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package controllers 6 | 7 | // Reason's to be used by event recorder 8 | const ( 9 | EventReasonInitialized = "Initialized" 10 | EventReasonFailed = "Failed" 11 | EventReasonWaiting = "Waiting" 12 | EventReasonCreated = "Created" 13 | EventReasonReady = "Ready" 14 | EventReasonBackOff = "BackOff" 15 | EventReasonShutdownCompleted = "ShutdownCompleted" 16 | EventReasonDecommissionCompleted = "DecommissionCompleted" 17 | EventReasonDeleted = "CRDeleted" 18 | EventReasonUpdated = "CRUpdated" 19 | ) 20 | -------------------------------------------------------------------------------- /docs/samples/sample-pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: ais-aistorecpu1012-sdc # All PV names should be unique, a good practice is to use the node name and the mount path 5 | spec: 6 | accessModes: 7 | - ReadWriteOnce 8 | capacity: 9 | storage: 5T # The size of the volume 10 | claimRef: 11 | apiVersion: v1 12 | kind: PersistentVolumeClaim 13 | name: ais-data-aistore-ais-target-3 # The name of the PVC that will use this PV 14 | namespace: ais 15 | hostPath: 16 | path: /ais/sdc 17 | type: "" 18 | nodeAffinity: 19 | required: 20 | nodeSelectorTerms: 21 | - matchExpressions: 22 | - key: kubernetes.io/hostname 23 | operator: In 24 | values: 25 | - aistorecpu1012.nsv.sjc4.nvmetal.net 26 | -------------------------------------------------------------------------------- /helm/ais/scripts/delete-pvc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "Usage: $0 " 5 | exit 1 6 | fi 7 | 8 | NAMESPACE="$1" 9 | STORAGE_CLASS="$2" 10 | 11 | # Get PVCs in the specified namespace with the target storage class 12 | PVC_LIST=$(kubectl get pvc -n "$NAMESPACE" -o jsonpath="{range .items[?(@.spec.storageClassName == \"$STORAGE_CLASS\")]}{.metadata.name}{'\n'}{end}") 13 | 14 | # Delete 
each PVC 15 | if [ -z "$PVC_LIST" ]; then 16 | echo "No PVCs found with storage class '$STORAGE_CLASS' in namespace '$NAMESPACE'" 17 | else 18 | echo "Deleting PVCs in namespace '$NAMESPACE' with storage class '$STORAGE_CLASS':" 19 | echo "$PVC_LIST" | while read -r PVC; do 20 | kubectl delete pvc -n "$NAMESPACE" "$PVC" 21 | done 22 | fi 23 | -------------------------------------------------------------------------------- /operator/tests/tutils/clientset.go: -------------------------------------------------------------------------------- 1 | // Package tutils provides utilities for running AIS operator tests 2 | /* 3 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package tutils 6 | 7 | import ( 8 | "fmt" 9 | 10 | "k8s.io/client-go/kubernetes" 11 | "sigs.k8s.io/controller-runtime/pkg/client/config" 12 | ) 13 | 14 | // NewClientset returns a kubernetes.Clientset created w/ the current 15 | // in-cluster or KUBECONFIG environment. 16 | func NewClientset() (*kubernetes.Clientset, error) { 17 | cfg, err := config.GetConfig() 18 | if err != nil { 19 | return nil, fmt.Errorf("error loading kubeconfig: %w", err) 20 | } 21 | cs, err := kubernetes.NewForConfig(cfg) 22 | if err != nil { 23 | return nil, fmt.Errorf("error creating clientset: %w", err) 24 | } 25 | return cs, nil 26 | } 27 | -------------------------------------------------------------------------------- /operator/scripts/install_helm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $# -ne 2 ]; then 6 | echo "Usage: $0 " 7 | echo "Example: $0 /home/user/.local/bin v3.14.4" 8 | exit 1 9 | fi 10 | 11 | DESIRED_PATH="$1" 12 | HELM_VERSION="$2" 13 | 14 | # Check if helm is already installed in the desired path 15 | if [ -f "$DESIRED_PATH/helm" ]; then 16 | echo "Helm already installed at $DESIRED_PATH/helm" 17 | exit 0 18 | fi 19 | 20 | TMP_DIR="$(mktemp -d)" 21 | mkdir -p "$DESIRED_PATH" 22 | cd "$TMP_DIR" 23 | echo "Downloading 'get-helm' script" 24 | curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 25 | chmod 700 get_helm.sh 26 | HELM_INSTALL_DIR="$DESIRED_PATH" ./get_helm.sh --version "$HELM_VERSION" --no-sudo 27 | rm -rf "$TMP_DIR" -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/create_pv/files/pv.template.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Local persistent volume on each storage target. 3 | # One PV per mountpath. 
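# The $VARS below are substituted (envsubst-style) by the deployment tooling before `kubectl apply`; for example, with hypothetical values mirroring docs/samples/sample-pv.yaml: NAME=ais-aistorecpu1012-sdc, MPATH=/ais/sdc, MPATH_SIZE=5T, CLAIM_NAME=ais-data-aistore-ais-target-3, NAMESPACE=ais, NODE=aistorecpu1012.nsv.sjc4.nvmetal.net.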
4 | # 5 | 6 | apiVersion: v1 7 | kind: PersistentVolume 8 | metadata: 9 | name: $NAME 10 | labels: 11 | type: local 12 | cluster: ais 13 | mpath: $MPATH_LABEL 14 | spec: 15 | storageClassName: ais-local-storage 16 | capacity: 17 | storage: $MPATH_SIZE # should be set to size of mpath 18 | accessModes: 19 | - ReadWriteOnce 20 | hostPath: 21 | path: $MPATH 22 | claimRef: 23 | name: $CLAIM_NAME 24 | namespace: $NAMESPACE 25 | nodeAffinity: 26 | required: 27 | nodeSelectorTerms: 28 | - matchExpressions: 29 | - key: kubernetes.io/hostname 30 | operator: In 31 | values: 32 | - $NODE 33 | -------------------------------------------------------------------------------- /helm/ais/config/ais/neb-fin-test.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 930Gi 5 | mounts: 6 | - path: /ais/vdc 7 | - path: /ais/vdd 8 | - path: /ais/vde 9 | - path: /ais/vdf 10 | - path: /ais/vdg 11 | size: 3 12 | protocol: http 13 | https: 14 | skipVerifyCert: false 15 | nodeImage: 16 | tag: v4.0-fuse 17 | initImage: 18 | tag: v4.0 19 | logSidecarImage: 20 | tag: v1.1 21 | cloud: 22 | awsSecretName: "aws-creds" 23 | configToUpdate: 24 | memsys: 25 | min_free: 8GiB 26 | to_gc: 8GiB 27 | hk_time: 3m 28 | backend: 29 | aws: {} 30 | net: 31 | http: 32 | idle_conn_time: 20s 33 | idle_conns: 2048 34 | idle_conns_per_host: 128 35 | stateStorageClass: "local-path" 36 | authNSecretName: 37 | proxyLB: 38 | enabled: true -------------------------------------------------------------------------------- /monitoring/alloy/helmfile.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | environments: 2 | prod: {} 3 | local: {} 4 | remote: {} 5 | 6 | --- 7 | 8 | repositories: 9 | - name: grafana 10 | url: https://grafana.github.io/helm-charts 11 | 12 | releases: 13 | - name: alloy-config 14 | namespace: monitoring 15 | chart: ./config-chart # Local chart for configmap 16 | createNamespace: true 17 | values: 18 | - environment: {{ .Environment.Name }} 19 | - ./environments/{{ .Environment.Name }}/values.yaml.gotmpl 20 | 21 | - name: alloy 22 | namespace: monitoring 23 | createNamespace: true 24 | chart: grafana/alloy 25 | version: 0.10.1 26 | values: 27 | - ./base-alloy-values.yaml.gotmpl 28 | - ./environments/{{ .Environment.Name }}/alloy-values.yaml 29 | needs: 30 | - alloy-config 31 | -------------------------------------------------------------------------------- /helm/authn/charts/authn/values.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | # Default values for authn chart 2 | image: 3 | repository: aistorage/authn 4 | tag: "latest" 5 | pullPolicy: IfNotPresent 6 | 7 | superuser: 8 | name: "admin" 9 | password: {{ requiredEnv "AUTHN_ADMIN_PASSWORD" }} 10 | 11 | jwtSigningKey: {{ requiredEnv "JWT_SIGNING_KEY" }} 12 | 13 | tls: 14 | enabled: false 15 | createCert: false 16 | secretName: "{{ .Release.Name }}-tls-certs" 17 | certPath: "/var/certs/tls.crt" 18 | keyPath: "/var/certs/tls.key" 19 | 20 | log: 21 | level: "3" 22 | dir: "/var/log/ais/authn/" 23 | 24 | applicationPort: 52001 25 | servicePort: 52001 26 | nodePort: 30001 27 | 28 | lb: 29 | enabled: false 30 | port: 52001 31 | clusterIP: 32 | annotations: 33 | 34 | persistence: 35 | enabled: true 36 | size: 50Mi 37 | hostPath: /etc/ais/authn -------------------------------------------------------------------------------- 
/operator/config/overlays/default/manager_authn_ca_configmap_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | volumeMounts: 12 | - name: auth-ca 13 | mountPath: /etc/ssl/certs/auth-ca 14 | readOnly: true 15 | volumes: 16 | - name: auth-ca 17 | configMap: 18 | # Placeholder replaced during Helm chart build with {{ .Values.controllerManager.manager.authCAConfigmapName }} 19 | # Default value: ais-operator-auth-ca 20 | # Can be overridden in Helm values to use trust-manager bundles 21 | name: AUTH_CA_CONFIGMAP_PLACEHOLDER 22 | optional: true 23 | 24 | -------------------------------------------------------------------------------- /playbooks/security/roles/journald/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Journald is a system service for collecting and storing log data, introduced with systemd. It tries to make it easier 3 | # for system administrators to find interesting and relevant information among an ever-increasing amount of log messages. 4 | - name: Ensure journald is configured to compress large log files 5 | lineinfile: 6 | path: /etc/systemd/journald.conf 7 | regexp: '^Compress=' 8 | line: 'Compress=yes' 9 | create: yes 10 | 11 | - name: Ensure journald is configured to write logfiles to persistent disk 12 | lineinfile: 13 | path: /etc/systemd/journald.conf 14 | regexp: '^Storage=' 15 | line: 'Storage=persistent' 16 | create: yes 17 | 18 | - name: Restart systemd-journald service 19 | service: 20 | name: systemd-journald 21 | state: restarted 22 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/vars/ais_mpaths.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Contains list of mountpaths on hosts and their size. 3 | # Make sure to adjust the mountpaths based on host setup. 4 | # This variable is used by `ais_cleanup_all.yml`, `ais_cleanup_markers.yml` and, 5 | # `ais_deploy_cluster.yml`. 6 | # 7 | # Mountpaths can also be passed as a command line variable i.e. using -e 8 | # Eg. 
9 | # ANSIBLE_CONFIG=dev-test/ansible.cfg ansible-playbook -i dev-test/host.ini \ 10 | # ais_cleanup_all.yml -e cluster=ais-1 -e ais_mpaths=["/ais/sda", "/ais/sdb",...,"/ais/sdj"] -e ais_mpath_size=9Ti 11 | # 12 | 13 | # --- 14 | # ais_mpaths: 15 | # - "/ais/sda" 16 | # - "/ais/sdb" 17 | # - "/ais/sdc" 18 | # - "/ais/sdd" 19 | # - "/ais/sde" 20 | # - "/ais/sdf" 21 | # - "/ais/sdg" 22 | # - "/ais/sdh" 23 | # - "/ais/sdi" 24 | # - "/ais/sdj" 25 | # 26 | # ais_mpath_size: 9Ti 27 | # 28 | -------------------------------------------------------------------------------- /playbooks/cloud/roles/gcp_config/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure target directory exists 2 | become: true 3 | file: 4 | path: "{{ target_dir }}" 5 | state: directory 6 | 7 | - name: Copy GCP credentials JSON file to target directory 8 | become: true 9 | ansible.builtin.copy: 10 | src: "gcp.json" 11 | dest: "{{ target_dir }}/gcp.json" 12 | 13 | - name: Remove existing Kubernetes secret if it exists 14 | shell: kubectl delete secret {{ secret_name }} -n ais --ignore-not-found 15 | 16 | - name: Create new Kubernetes secret from GCP credentials file 17 | shell: "kubectl create secret -n ais generic {{ secret_name }} \ 18 | --from-file=gcp.json={{ target_dir }}/gcp.json" 19 | 20 | - name: Clean up - remove target directory and its contents 21 | become: true 22 | file: 23 | path: "{{ target_dir }}" 24 | state: absent 25 | 26 | 27 | -------------------------------------------------------------------------------- /playbooks/security/roles/tmp_dir/tasks/main.yml: -------------------------------------------------------------------------------- 1 | # The /tmp directory is a world-writable directory used to store data used by the system and 2 | # user applications for a short period of time. This data should have no expectation of surviving 3 | # a reboot, as this directory is intended to be emptied after each reboot. 
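# The tmpfs mount options used below (nosuid,nodev) keep setuid binaries and device nodes off /tmp, and size=2G caps its memory footprint.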
4 | 5 | # Ensure /tmp is a separate partition 6 | - name: Set up /tmp to use tmpfs 7 | command: systemctl unmask tmp.mount 8 | 9 | - name: Add entry to /etc/fstab for /tmp 10 | lineinfile: 11 | path: /etc/fstab 12 | line: "tmpfs /tmp tmpfs defaults,rw,nosuid,nodev,relatime,size=2G 0 0" 13 | 14 | - name: Reload systemd configuration 15 | command: systemctl daemon-reload 16 | 17 | - name: Mount /tmp 18 | mount: 19 | path: /tmp 20 | src: tmpfs 21 | fstype: tmpfs 22 | opts: defaults,rw,nosuid,nodev,relatime,size=2G 23 | state: mounted -------------------------------------------------------------------------------- /monitoring/kube-prom/dashboard-configmap/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: ais-grafana-dashboard 5 | namespace: monitoring 6 | labels: 7 | grafana_dashboard: "1" 8 | data: 9 | ais-dashboard.json: |- 10 | {{ .Files.Get "ais_dashboard.json" | indent 4 }} 11 | 12 | --- 13 | 14 | apiVersion: v1 15 | kind: ConfigMap 16 | metadata: 17 | name: ais-grafana-dashboard-legacy 18 | namespace: monitoring 19 | labels: 20 | grafana_dashboard: "1" 21 | data: 22 | ais-dashboard-old.json: |- 23 | {{ .Files.Get "ais_dashboard_old.json" | indent 4 }} 24 | 25 | --- 26 | 27 | apiVersion: v1 28 | kind: ConfigMap 29 | metadata: 30 | name: k8s-grafana-dashboard 31 | namespace: monitoring 32 | labels: 33 | grafana_dashboard: "1" 34 | data: 35 | k8s-dashboard.json: |- 36 | {{ .Files.Get "k8s_dashboard.json" | indent 4 }} 37 | -------------------------------------------------------------------------------- /helm/ais/config/ais/sjc112.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 5.8Ti 5 | mounts: 6 | - path: /ais/nvme0n1 7 | - path: /ais/nvme1n1 8 | - path: /ais/nvme2n1 9 | - path: /ais/nvme3n1 10 | - path: /ais/nvme4n1 11 | - path: /ais/nvme5n1 12 | - path: /ais/nvme6n1 13 | - path: /ais/nvme7n1 14 | - path: /ais/nvme8n1 15 | - path: /ais/nvme9n1 16 | - path: /ais/nvme10n1 17 | - path: /ais/nvme11n1 18 | - path: /ais/nvme12n1 19 | - path: /ais/nvme13n1 20 | - path: /ais/nvme14n1 21 | - path: /ais/nvme15n1 22 | size: 3 23 | protocol: https 24 | https: 25 | skipVerifyCert: false 26 | tlsSecret: "tls-certs" 27 | nodeImage: 28 | tag: v4.1 29 | initImage: 30 | tag: v4.1 31 | logSidecarImage: 32 | tag: v1.1 33 | configToUpdate: 34 | backend: 35 | aws: {} 36 | stateStorageClass: "local-path" -------------------------------------------------------------------------------- /monitoring/alloy/config-chart/common/common.alloy.gotmpl: -------------------------------------------------------------------------------- 1 | {{- if .Values.remote }} 2 | 3 | remote.kubernetes.secret "azure_oidc" { 4 | namespace = "monitoring" 5 | name = "{{ .Values.remote.secret }}" 6 | } 7 | 8 | otelcol.auth.oauth2 "azure_oidc" { 9 | client_id = convert.nonsensitive(remote.kubernetes.secret.azure_oidc.data.azure_app_client_id) 10 | client_secret = remote.kubernetes.secret.azure_oidc.data.azure_app_client_secret 11 | scopes = ["{{ .Values.remote.scope}}"] 12 | token_url = convert.nonsensitive(remote.kubernetes.secret.azure_oidc.data.azure_token_url) 13 | } 14 | 15 | otelcol.processor.attributes "remote_insert_label" { 16 | action { 17 | key = "cluster" 18 | value = "{{ .Values.remote.label }}" 19 | action = "insert" 20 | } 21 | output { 22 | metrics = 
[otelcol.exporter.otlphttp.remote_metrics_export.input] 23 | } 24 | } 25 | 26 | {{- end }} -------------------------------------------------------------------------------- /playbooks/host-config/ais_host_config_sysctl.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Apply sysctl tweaks to the 'ais' host group 3 | # 4 | --- 5 | - hosts: '{{ ais_hosts | default("ais") }}' 6 | become: true 7 | gather_facts: false 8 | vars_files: 9 | - "vars/host_config_sysctl.yml" # Base variables 10 | 11 | vars_prompt: 12 | - name: host_config_confirmation 13 | prompt: "Are you sure you want to run ais_host_config_sysctl with {{ env | default('default') }} settings on {{ ais_hosts }} cluster? Type 'yes' to confirm." 14 | default: "no" 15 | private: no 16 | 17 | pre_tasks: 18 | - name: Include environment-specific sysctl overrides (if `env` is set) 19 | include_vars: "vars/environments/{{ env }}/host_config_sysctl.yml" 20 | when: env is defined 21 | tags: always 22 | 23 | roles: 24 | - role: ais_host_config_sysctl 25 | when: host_config_confirmation | bool -------------------------------------------------------------------------------- /playbooks/security/roles/sudo/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # sudo can be configured to run only from a pseudo terminal ( pseudo-pty ). 3 | - name: Ensure sudo commands use pty 4 | lineinfile: 5 | path: /etc/sudoers.d/use_pty 6 | line: "Defaults use_pty" 7 | create: yes 8 | 9 | - name: Ensure sudo log file exists 10 | lineinfile: 11 | path: /etc/sudoers.d/sudo_log 12 | line: 'Defaults logfile="/var/log/sudo.log"' 13 | create: yes 14 | 15 | - name: Set proper permissions for the sudoers.d files 16 | file: 17 | path: "{{ item }}" 18 | owner: root 19 | group: root 20 | mode: '0440' 21 | with_items: 22 | - /etc/sudoers.d/use_pty 23 | - /etc/sudoers.d/sudo_log 24 | 25 | - name: Create the sudo log file if it doesn't exist 26 | file: 27 | path: /var/log/sudo.log 28 | state: touch 29 | owner: root 30 | group: root 31 | mode: '0640' 32 | 33 | -------------------------------------------------------------------------------- /operator/scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | release_version=${RELEASE:-v0.5} 4 | 5 | function pre_deploy { 6 | read -r -p "would you like to deploy cert-manager? [y/n]" response 7 | if [[ "${response}" == "y" ]]; then 8 | kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.16.1/cert-manager.yaml 9 | 10 | # Wait for cert-manager to be ready. 
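# (The wait below blocks until every pod in the cert-manager namespace reports Ready, or fails after the 5m timeout.)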
11 | kubectl wait --for=condition=ready pods --all -n cert-manager --timeout=5m 12 | fi 13 | } 14 | 15 | should_build=0 16 | for arg in "$@" 17 | do 18 | case $arg in 19 | -b|--build) 20 | should_build=1 21 | esac 22 | done 23 | 24 | pre_deploy 25 | if [[ $should_build == 0 ]]; then 26 | kubectl apply -f https://github.com/NVIDIA/ais-k8s/releases/download/${release_version}/ais-operator.yaml 27 | else 28 | bin/kustomize build config/default | kubectl apply -f - 29 | fi 30 | -------------------------------------------------------------------------------- /.github/workflows/docker_ais_operator_helper.yml: -------------------------------------------------------------------------------- 1 | # Builds and pushes the ais-deploy-helper image, containing scripts and templates to prepare systems for ais deployment 2 | name: Docker Image -- AIS Operator Helper 3 | 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | image_tag: 8 | description: 'AIS Operator Helper image tag' 9 | required: true 10 | default: 'latest' 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v6 16 | - name: Login to DockerHub 17 | uses: docker/login-action@v3 18 | with: 19 | username: ${{ secrets.DOCKERHUB_USERNAME }} 20 | password: ${{ secrets.DOCKERHUB_TOKEN }} 21 | 22 | - name: Build and Push 'aistorage/ais-operator-helper' 23 | run: | 24 | pushd $GITHUB_WORKSPACE/ais-operator-helper 25 | TAG="${{ inputs.image_tag }}" make all 26 | popd 27 | -------------------------------------------------------------------------------- /helm/ais/config/ais/oci-iad-test.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 6.2Ti 5 | mounts: 6 | - path: /ais/nvme0n1 7 | - path: /ais/nvme1n1 8 | - path: /ais/nvme2n1 9 | - path: /ais/nvme3n1 10 | - path: /ais/nvme4n1 11 | - path: /ais/nvme5n1 12 | - path: /ais/nvme6n1 13 | - path: /ais/nvme7n1 14 | - path: /ais/nvme8n1 15 | - path: /ais/nvme9n1 16 | - path: /ais/nvme10n1 17 | - path: /ais/nvme11n1 18 | size: 3 19 | nodeImage: 20 | tag: v3.26-312a648 21 | initImage: 22 | tag: v3.26-a7ac713 23 | logSidecarImage: 24 | tag: v1.1 25 | cloud: 26 | awsSecretName: "aws-creds" 27 | stateStorageClass: "local-path" 28 | proxySpec: 29 | resources: 30 | requests: 31 | cpu: "8" 32 | limits: 33 | cpu: "32" 34 | targetSpec: 35 | resources: 36 | requests: 37 | cpu: "128" 38 | limits: 39 | cpu: "192" -------------------------------------------------------------------------------- /auth/keycloak/manifests/keycloak.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: k8s.keycloak.org/v2alpha1 2 | kind: Keycloak 3 | metadata: 4 | name: keycloak-server 5 | namespace: keycloak 6 | spec: 7 | instances: 1 8 | db: 9 | vendor: postgres 10 | host: cloudnative-pg-cluster-rw.cnpg-database.svc.cluster.local 11 | database: app 12 | usernameSecret: 13 | name: keycloak-db-secret 14 | key: username 15 | passwordSecret: 16 | name: keycloak-db-secret 17 | key: password 18 | http: 19 | httpEnabled: true 20 | httpPort: 8180 21 | httpsPort: 8543 22 | tlsSecret: keycloak-tls 23 | hostname: 24 | # Sets up Keycloak server to be internally accessible by K8s service 25 | hostname: keycloak-server-service.keycloak.svc.cluster.local 26 | ingress: 27 | enabled: true 28 | proxy: 29 | headers: xforwarded # double check your reverse proxy sets and overwrites the X-Forwarded-* headers 
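# Once the server is ready, in-cluster clients can discover the standard Keycloak endpoints under the hostname above, e.g. (realm name hypothetical): https://keycloak-server-service.keycloak.svc.cluster.local:8543/realms/<realm>/.well-known/openid-configuration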
-------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_enable_multiqueue/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Add line for MQ variable in grub cfg 3 | lineinfile: 4 | path: /etc/default/grub 5 | regexp: '^MQ=' 6 | line: 'MQ="scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=n"' 7 | insertbefore: '^GRUB_CMDLINE_LINUX=' 8 | firstmatch: yes 9 | backup: yes 10 | register: addmq1 11 | 12 | - name: Include MQ in GRUB_CMDLINE_LINUX 13 | lineinfile: 14 | path: /etc/default/grub 15 | regexp: '^GRUB_CMDLINE_LINUX="\$GRUB' 16 | line: 'GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX $MQ"' 17 | insertafter: '^GRUB_CMDLINE_LINUX=' 18 | backup: yes 19 | register: addmq2 20 | 21 | - name: Update grub.cfg 22 | command: update-grub 23 | when: addmq1.changed or addmq2.changed 24 | 25 | - name: Note reboot required 26 | debug: 27 | msg: Manual reboot is required for MQ change to take effect 28 | when: addmq1.changed or addmq2.changed -------------------------------------------------------------------------------- /.github/workflows/docker_operator.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image -- Operator 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | image_tag: 7 | description: 'Operator image tag' 8 | required: true 9 | default: 'latest' 10 | 11 | env: 12 | AISOPERATOR_IMAGE: 'aistorage/ais-operator' 13 | 14 | jobs: 15 | docker: 16 | runs-on: ubuntu-22.04 17 | steps: 18 | 19 | - uses: actions/checkout@v6 20 | - uses: actions/setup-go@v6 21 | with: 22 | go-version: '1.22.x' 23 | - name: Login to DockerHub 24 | uses: docker/login-action@v3 25 | with: 26 | username: ${{ secrets.DOCKERHUB_USERNAME }} 27 | password: ${{ secrets.DOCKERHUB_TOKEN }} 28 | 29 | - name: aisoperator image 30 | run: | 31 | pushd $GITHUB_WORKSPACE/operator 32 | IMG="${{ env.AISOPERATOR_IMAGE }}:${{ inputs.image_tag }}" make docker-build docker-push 33 | popd -------------------------------------------------------------------------------- /helm/authn/helmfile.yaml: -------------------------------------------------------------------------------- 1 | environments: 2 | default: 3 | values: 4 | - envFile: default 5 | tls: 6 | values: 7 | - envFile: tls 8 | neb-fin: 9 | values: 10 | - envFile: nvidia 11 | kubeContext: neb-fin 12 | oci-kratos: 13 | values: 14 | - envFile: oci 15 | kubeContext: oci-kratos 16 | oci-ord: 17 | values: 18 | - envFile: oci 19 | kubeContext: oci-ord 20 | oci-iad: 21 | values: 22 | - envFile: oci 23 | kubeContext: oci-iad 24 | sjc4-dev: 25 | values: 26 | - envFile: nvidia 27 | kubeContext: sjc4-dev 28 | --- 29 | releases: 30 | - name: ais-authn 31 | namespace: ais 32 | createNamespace: true 33 | chart: charts/authn 34 | version: 0.1.0 35 | values: 36 | - "./charts/authn/values.yaml.gotmpl" 37 | - "./config/authn/{{ .Values.envFile }}.yaml.gotmpl" 38 | - "./config/authn/cert/{{ .Environment.Name }}.yaml" -------------------------------------------------------------------------------- /operator/scripts/cloud-provider-kind.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | ACTION="${1:-}" 5 | BINARY="${2:-}" 6 | LOCALBIN="${3:-./bin}" 7 | PID_FILE="${LOCALBIN}/cloud-provider-kind.pid" 8 | LOG_FILE="${LOCALBIN}/cloud-provider-kind.log" 9 | 10 | start() { 11 | [ -f "${PID_FILE}" ] && kill -0 "$(cat "${PID_FILE}")" 2>/dev/null && \ 12 | echo "cloud-provider-kind 
already running" && return 0 13 | 14 | "${BINARY}" > "${LOG_FILE}" 2>&1 & 15 | echo $! > "${PID_FILE}" 16 | echo "Started cloud-provider-kind" 17 | } 18 | 19 | stop() { 20 | [ ! -f "${PID_FILE}" ] && echo "cloud-provider-kind not running" && return 0 21 | 22 | kill "$(cat "${PID_FILE}")" 2>/dev/null || true 23 | rm -f "${PID_FILE}" "${LOG_FILE}" 24 | echo "Stopped cloud-provider-kind" 25 | } 26 | 27 | case "${ACTION}" in 28 | start|stop) "$ACTION" ;; 29 | *) echo "Usage: $0 {start|stop} <binary> [localbin]" >&2; exit 1 ;; 30 | esac 31 | 32 | -------------------------------------------------------------------------------- /operator/tests/ci/test_in_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run E2E operator tests in CI using a cached image and test pod 4 | 5 | set -euo pipefail 6 | 7 | IMAGE_NAME="operator-test" 8 | TEST_POD_NAME="operator-test-pod" 9 | 10 | # Apply RBAC permissions needed for the test pod 11 | kubectl apply -f scripts/rbac.yaml 12 | 13 | # Load the cached test image archive into the KinD cluster 14 | kind load image-archive /operator-test.tar --name "${KIND_CLUSTER_NAME}" 15 | 16 | # Apply the test pod manifest with environment variable substitution 17 | envsubst < scripts/test_pod.yaml | kubectl apply -f - 18 | 19 | # Wait until the pod is ready 20 | kubectl wait --for=condition=Ready "pod/${TEST_POD_NAME}" --timeout=120s 21 | 22 | # Copy the current `operator` source into the pod for testing 23 | kubectl cp . "${TEST_POD_NAME}:/operator" 24 | 25 | # Execute tests inside the pod 26 | kubectl exec "${TEST_POD_NAME}" -- bash -c "make -C /operator test-e2e" -------------------------------------------------------------------------------- /operator/api/v1beta1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | // Package contains declaration of AIS Kubernetes Custom Resource Definitions 2 | /* 3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | // Package v1beta1 contains API Schema definitions for the ais v1beta1 API group 6 | // +kubebuilder:object:generate=true 7 | // +groupName=ais.nvidia.com 8 | package v1beta1 9 | 10 | import ( 11 | "k8s.io/apimachinery/pkg/runtime/schema" 12 | "sigs.k8s.io/controller-runtime/pkg/scheme" 13 | ) 14 | 15 | var ( 16 | // GroupVersion is group version used to register these objects 17 | GroupVersion = schema.GroupVersion{Group: "ais.nvidia.com", Version: "v1beta1"} 18 | 19 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 20 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 21 | 22 | // AddToScheme adds the types in this group-version to the given scheme. 23 | AddToScheme = SchemeBuilder.AddToScheme 24 | ) 25 | -------------------------------------------------------------------------------- /helm/ais/scripts/delete-released-pv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Usage: $0 <storage-class>" 5 | exit 1 6 | fi 7 | 8 | STORAGE_CLASS="$1" 9 | 10 | # Get all PVs with given storage class and status Released 11 | PVS=$(kubectl get pv \ 12 | --no-headers \ 13 | -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,SC:.spec.storageClassName \ 14 | | awk -v sc="$STORAGE_CLASS" '$2 == "Released" && $3 == sc {print $1}') 15 | 16 | if [ -z "$PVS" ]; then 17 | echo "No Released PVs found for storage class '$STORAGE_CLASS'."
18 | exit 0 19 | fi 20 | 21 | echo "Deleting the following Released PVs with storage class '$STORAGE_CLASS':" 22 | echo "$PVS" 23 | 24 | read -p "Are you sure you want to delete ALL of these PVs? [y/N]: " CONFIRM 25 | case "$CONFIRM" in 26 | [yY][eE][sS]|[yY]) ;; 27 | *) echo "Aborted. No PVs were deleted."; exit 0 ;; 28 | esac 29 | 30 | for pv in $PVS; do 31 | kubectl delete pv "$pv" 32 | done -------------------------------------------------------------------------------- /playbooks/ais-deployment/roles/generate_https_cert/templates/ca.yaml.j2: -------------------------------------------------------------------------------- 1 | {{ ansible_managed | comment }} 2 | --- 3 | apiVersion: cert-manager.io/v1 4 | kind: ClusterIssuer 5 | metadata: 6 | name: selfsigned-issuer 7 | spec: 8 | selfSigned: {} 9 | --- 10 | apiVersion: cert-manager.io/v1 11 | kind: Certificate 12 | metadata: 13 | name: selfsigned-cert 14 | namespace: {{ cluster }} 15 | spec: 16 | secretName: {{ ca_cert_secret }} 17 | isCA: true 18 | commonName: selfsigned-ca 19 | duration: 8760h # 1 year 20 | renewBefore: 720h # 30 days 21 | privateKey: 22 | algorithm: RSA 23 | encoding: PKCS1 24 | size: 4096 25 | subject: 26 | organizations: 27 | - NVIDIA 28 | issuerRef: 29 | name: selfsigned-issuer 30 | kind: ClusterIssuer 31 | group: cert-manager.io 32 | --- 33 | apiVersion: cert-manager.io/v1 34 | kind: Issuer 35 | metadata: 36 | name: ca-issuer 37 | namespace: {{ cluster }} 38 | spec: 39 | ca: 40 | secretName: {{ ca_cert_secret }} -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/sjc11.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # consistent url for client pod 18 | - "ais-proxy.ais.svc.cluster.local" 19 | # used for intra cluster communication 20 | - "*.ais-proxy.ais.svc.cluster.local" 21 | - "*.ais-target.ais.svc.cluster.local" 22 | # used for ais-operator communication 23 | - "ais-proxy.ais" 24 | - "*.nsv.sjc11.nvmetal.net" 25 | ipAddresses: 26 | - 127.0.0.1 27 | - 10.52.160.21 28 | - 10.52.160.20 29 | - 10.52.160.87 30 | emailAddress: ais@exchange.nvidia.com 31 | issuerRef: 32 | name: ca-issuer 33 | kind: ClusterIssuer -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/sjc112.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # consistent url for client pod 18 | - "ais-proxy.ais.svc.cluster.local" 19 | # used for intra cluster communication 20 | - "*.ais-proxy.ais.svc.cluster.local" 21 | - "*.ais-target.ais.svc.cluster.local" 22 | # used for ais-operator communication 23 | - "ais-proxy.ais" 24 | - "*.nsv.sjc11.nvmetal.net" 25 | ipAddresses: 26 | - 127.0.0.1 27 | - 10.52.160.18 28 | - 10.52.160.16 29 | - 10.52.160.15 30 | 
emailAddress: ais@exchange.nvidia.com 31 | issuerRef: 32 | name: ca-issuer 33 | kind: ClusterIssuer -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/sjc4-1000.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # consistent url for client pod 18 | - "ais-proxy.ais.svc.cluster.local" 19 | # used for intra cluster communication 20 | - "*.ais-proxy.ais.svc.cluster.local" 21 | - "*.ais-target.ais.svc.cluster.local" 22 | # used for ais-operator communication 23 | - "ais-proxy.ais" 24 | - "*.sjc4.maas.cis" 25 | ipAddresses: 26 | - 127.0.0.1 27 | - 10.150.56.248 28 | - 10.150.56.245 29 | - 10.150.56.246 30 | emailAddress: ais@exchange.nvidia.com 31 | issuerRef: 32 | name: ca-issuer 33 | kind: ClusterIssuer -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/sjc4-dev.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # consistent url for client pod 18 | - "ais-proxy.ais.svc.cluster.local" 19 | # used for intra cluster communication 20 | - "*.ais-proxy.ais.svc.cluster.local" 21 | - "*.ais-target.ais.svc.cluster.local" 22 | # used for ais-operator communication 23 | - "ais-proxy.ais" 24 | - "*.nsv.sjc4.nvmetal.net" 25 | ipAddresses: 26 | - 127.0.0.1 27 | - 10.150.56.227 28 | - 10.150.56.230 29 | - 10.150.56.225 30 | emailAddress: aistore@nvidia.com 31 | issuerRef: 32 | name: ca-issuer 33 | kind: ClusterIssuer -------------------------------------------------------------------------------- /log-sidecar/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:latest as builder 2 | 3 | ARG COREUTILS_VERSION=9.7 4 | ARG TINI_VERSION=v0.19.0 5 | 6 | # Build a statically compiled version of tail to copy into the final image 7 | RUN apk add --no-cache build-base 8 | WORKDIR /src 9 | ENV FORCE_UNSAFE_CONFIGURE=1 10 | RUN wget https://ftp.gnu.org/gnu/coreutils/coreutils-${COREUTILS_VERSION}.tar.xz && \ 11 | tar xf coreutils-${COREUTILS_VERSION}.tar.xz && \ 12 | cd coreutils-${COREUTILS_VERSION} && \ 13 | ./configure LDFLAGS="-static" && \ 14 | make 15 | RUN mv /src/coreutils-${COREUTILS_VERSION}/src/tail /tail 16 | 17 | RUN wget -O /tini https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini-static && \ 18 | chmod +x /tini 19 | FROM gcr.io/distroless/static-debian12 20 | COPY --from=builder /tail /usr/bin/tail 21 | COPY --from=builder /tini /usr/bin/tini 22 | 23 | # tini handles SIGTERM for static tail 24 | ENTRYPOINT ["/usr/bin/tini", "--", "/usr/bin/tail", "-n+1", "-F"] 25 | CMD ["/dev/null"] -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/keycloak.yaml: 
-------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # Node names for direct communication 18 | - "keycloak-test-worker" 19 | - "keycloak-test-worker2" 20 | - "keycloak-test-worker3" 21 | # consistent url for client pod 22 | - "ais-proxy.ais.svc.cluster.local" 23 | # used for intra cluster communication 24 | - "*.ais-proxy.ais.svc.cluster.local" 25 | - "*.ais-target.ais.svc.cluster.local" 26 | # used for ais-operator communication 27 | - "ais-proxy.ais" 28 | emailAddress: ais@exchange.nvidia.com 29 | issuerRef: 30 | name: ca-issuer 31 | kind: ClusterIssuer -------------------------------------------------------------------------------- /monitoring/kube-state-metrics/README.md: -------------------------------------------------------------------------------- 1 | # Kube State Metrics 2 | 3 | ## Overview 4 | 5 | [Kube State Metrics](https://github.com/kubernetes/kube-state-metrics) (KSM) exposes Prometheus metrics about the state of Kubernetes API objects (e.g. Deployments, Pods, Nodes). This Helmfile deploys KSM as a standalone component in the `monitoring` namespace for Alloy to scrape. 6 | 7 | ## Usage 8 | 9 | Template manifests: 10 | 11 | ```bash 12 | helmfile -e prod template 13 | ``` 14 | 15 | Deploy/sync: 16 | 17 | ```bash 18 | helmfile -e prod sync 19 | ``` 20 | 21 | Port-forward for quick inspection (optional): 22 | 23 | ```bash 24 | kubectl -n monitoring port-forward svc/kube-state-metrics 8080:8080 25 | curl -s localhost:8080/metrics | head 26 | ``` 27 | 28 | ## Relevant Links 29 | 30 | - [Chart source](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) 31 | - [Default values](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml) 32 | -------------------------------------------------------------------------------- /operator/config/overlays/default/manager_webhook_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | args: 12 | - "--webhook-cert-path=/tmp/k8s-webhook-server/serving-certs" 13 | ports: 14 | - containerPort: 9443 15 | name: webhook-server 16 | protocol: TCP 17 | volumeMounts: 18 | - mountPath: /tmp/k8s-webhook-server/serving-certs 19 | name: cert 20 | readOnly: true 21 | - mountPath: /tmp/k8s-metrics-server/metrics-certs 22 | name: metrics-certs 23 | readOnly: true 24 | volumes: 25 | - name: cert 26 | secret: 27 | defaultMode: 420 28 | secretName: webhook-server-cert 29 | - name: metrics-certs 30 | secret: 31 | defaultMode: 420 32 | secretName: metrics-server-cert 33 | -------------------------------------------------------------------------------- /manifests/cloud/oci-proxy-lb.yaml: -------------------------------------------------------------------------------- 1 | # oci-proxy-lb.yaml 2 | # This YAML file defines a Kubernetes Service that configures a network load balancer (NLB) 3 | # for the AIStore proxy components in Oracle Cloud Infrastructure (OCI).
The service is designed 4 | # to be internal (accessible only within the cloud environment) and is set up to be scraped by Prometheus for monitoring purposes. 5 | apiVersion: v1 6 | kind: Service 7 | metadata: 8 | annotations: 9 | oci.oraclecloud.com/load-balancer-type: "nlb" 10 | oci-network-load-balancer.oraclecloud.com/internal: "true" 11 | oci-network-load-balancer.oraclecloud.com/node-label-selector: nvidia.com/ais-proxy=ais 12 | prometheus.io/scrape: "true" 13 | labels: 14 | app.kubernetes.io/name: ais 15 | name: ais-proxy-lb 16 | namespace: ais 17 | spec: 18 | ports: 19 | - name: pub 20 | port: 51080 21 | protocol: TCP 22 | targetPort: 51080 23 | selector: 24 | app.kubernetes.io/name: ais 25 | app.kubernetes.io/component: proxy 26 | type: LoadBalancer -------------------------------------------------------------------------------- /operator/config/overlays/default/manager_auth_metric_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch injects a sidecar container that acts as an HTTP proxy for the 2 | # controller manager; it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: controller-manager 7 | namespace: system 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: manager 13 | args: 14 | - "--health-probe-bind-address=:8081" 15 | - "--metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs" 16 | - "--metrics-bind-address=:8443" 17 | - "--leader-elect" 18 | ports: 19 | - containerPort: 8443 20 | name: https 21 | volumeMounts: 22 | - mountPath: /tmp/k8s-metrics-server/metrics-certs 23 | name: metrics-certs 24 | readOnly: true 25 | volumes: 26 | - name: metrics-certs 27 | secret: 28 | defaultMode: 420 29 | secretName: metrics-server-cert 30 | -------------------------------------------------------------------------------- /operator/scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | current_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" 4 | 5 | # Run e2e tests on existing K8s cluster. 6 | # `USE_EXISTING_CLUSTER=true` is set while running the tests to ensure the `envtest` environment isn't used. 7 | 8 | [[ $(command -v ginkgo) ]] || go install github.com/onsi/ginkgo/v2/ginkgo 9 | 10 | LABELS="" 11 | if [[ $1 == "manual" ]]; then 12 | LABELS="override" 13 | fi 14 | 15 | # Run as many workers as the number of tests or twice the CPU core count, whichever is smaller 16 | SPEC_COUNT=$(ginkgo --dry-run --no-color --label-filter="$LABELS" "$current_dir/../tests/e2e/..." 2>&1 | awk '/Will run/{print $3;exit}') 17 | CPU_COUNT=$(nproc) 18 | WORKERS=$(( SPEC_COUNT < CPU_COUNT * 2 ? SPEC_COUNT : CPU_COUNT * 2 )) 19 | [[ -z "$WORKERS" || "$WORKERS" -lt 1 ]] && WORKERS=1 20 | 21 | TEST_STORAGECLASS="${TEST_STORAGECLASS}" USE_EXISTING_CLUSTER=true \ 22 | ginkgo -vv -p --procs "$WORKERS" --label-filter="${LABELS}" -trace -coverprofile cover.out $current_dir/../tests/e2e/...
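# Invocation note (derived from the argument handling above): `./test.sh manual`
# sets the label filter to "override", so only specs labeled for manual runs
# execute; invoked with no argument, the full e2e suite runs.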
23 | -------------------------------------------------------------------------------- /helm/ais/config/ais/neb-fin.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 13.2Ti 5 | mounts: 6 | - path: /ais/vdc 7 | - path: /ais/vdd 8 | - path: /ais/vde 9 | - path: /ais/vdf 10 | - path: /ais/vdg 11 | - path: /ais/vdh 12 | - path: /ais/vdi 13 | size: 7 14 | protocol: https 15 | https: 16 | skipVerifyCert: false 17 | tlsSecret: "tls-certs" 18 | imagePullSecrets: 19 | - name: regcred 20 | nodeImage: 21 | tag: v4.1 22 | initImage: 23 | tag: v4.1 24 | logSidecarImage: 25 | tag: v1.1 26 | cloud: 27 | awsSecretName: "aws-creds" 28 | configToUpdate: 29 | memsys: 30 | min_free: 8GiB 31 | to_gc: 8GiB 32 | hk_time: 3m 33 | backend: 34 | aws: {} 35 | net: 36 | http: 37 | idle_conn_time: 20s 38 | idle_conns: 2048 39 | idle_conns_per_host: 128 40 | stateStorageClass: "local-path" 41 | authNSecretName: 42 | 43 | proxyLB: 44 | enabled: true 45 | targetSpec: 46 | annotations: 47 | "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" 48 | -------------------------------------------------------------------------------- /operator/scripts/test_in_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run E2E operator tests locally from an in-cluster test pod 4 | 5 | set -euo pipefail 6 | 7 | IMAGE_NAME="operator-test" 8 | TEST_POD_NAME="operator-test-pod" 9 | 10 | cleanup() { 11 | kubectl delete pod "${TEST_POD_NAME}" --ignore-not-found 12 | kubectl delete -f scripts/rbac.yaml --ignore-not-found 13 | } 14 | trap cleanup EXIT 15 | 16 | # Apply RBAC permissions needed for the test pod 17 | kubectl apply -f scripts/rbac.yaml 18 | 19 | # Build test image and load it into the local KinD cluster 20 | docker build -t "${IMAGE_NAME}" -f tests/test.dockerfile . 21 | kind load docker-image "${IMAGE_NAME}" --name "${KIND_CLUSTER_NAME}" 22 | 23 | # Apply the test pod manifest with environment variable substitution 24 | envsubst < scripts/test_pod.yaml | kubectl apply -f - 25 | 26 | # Wait until the pod is ready 27 | kubectl wait --for=condition=Ready "pod/${TEST_POD_NAME}" --timeout=120s 28 | 29 | # Execute tests inside the pod 30 | kubectl exec "${TEST_POD_NAME}" -- bash -c "make -C /operator test-e2e" -------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_ntp/tasks/main.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # 4 | # Replace standard Ubuntu NTP servers, and perform an initial sync before restarting. 5 | # The ntp package is already installed by ais_host_config_common. 
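# A minimal vars file for this role (vars/ntp.yml in this repo) might look like
# the following; the pool hostnames are illustrative placeholders, not defaults:
#
#   ntp_pools:
#     - time1.example.com
#     - time2.example.com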
6 | # 7 | 8 | - name: Remove stock Ubuntu NTP pool servers 9 | replace: 10 | path: /etc/ntp.conf 11 | regexp: '^\s*(pool.*ubuntu)' 12 | replace: '# \1' 13 | register: pool_removed 14 | 15 | - name: List our nominated server(s) 16 | lineinfile: 17 | path: /etc/ntp.conf 18 | insertafter: EOF 19 | line: 'pool {{ item }}' 20 | with_items: 21 | - "{{ ntp_pools }}" 22 | register: pool_added 23 | 24 | - name: Stop ntp 25 | service: 26 | name: ntp 27 | state: stopped 28 | when: pool_removed.changed or pool_added.changed 29 | 30 | - name: Perform initial, possibly large, resync 31 | command: ntpd -gq 32 | when: pool_removed.changed or pool_added.changed 33 | 34 | - name: Start ntp 35 | service: 36 | name: ntp 37 | state: started 38 | when: pool_removed.changed or pool_added.changed -------------------------------------------------------------------------------- /operator/tests/ci/kind_cluster_ci.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | containerdConfigPatches: 4 | - |- 5 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] 6 | SystemdCgroup = false 7 | kubeadmConfigPatches: 8 | - | 9 | kind: KubeletConfiguration 10 | apiVersion: kubelet.config.k8s.io/v1beta1 11 | featureGates: 12 | KubeletInUserNamespace: true 13 | cgroupDriver: "cgroupfs" 14 | nodes: 15 | - role: control-plane 16 | extraMounts: 17 | - hostPath: /ci-kind-logs/control-plane 18 | containerPath: /var/log 19 | - role: worker 20 | labels: 21 | ais-node: true 22 | extraMounts: 23 | - hostPath: /ci-kind-logs/worker1 24 | containerPath: /var/log 25 | - role: worker 26 | labels: 27 | ais-node: true 28 | extraMounts: 29 | - hostPath: /ci-kind-logs/worker2 30 | containerPath: /var/log 31 | - role: worker 32 | labels: 33 | ais-node: true 34 | extraMounts: 35 | - hostPath: /ci-kind-logs/worker3 36 | containerPath: /var/log 37 | -------------------------------------------------------------------------------- /operator/pkg/resources/statsd/configmap.go: -------------------------------------------------------------------------------- 1 | // Package statsd contains k8s resources required for statsd 2 | /* 3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
4 | */ 5 | package statsd 6 | 7 | import ( 8 | aisv1 "github.com/ais-operator/api/v1beta1" 9 | corev1 "k8s.io/api/core/v1" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | "k8s.io/apimachinery/pkg/types" 12 | ) 13 | 14 | const ConfigFile = "statsd.json" 15 | 16 | func ConfigMapName(ais *aisv1.AIStore) string { 17 | return ais.Name + "-statsd" 18 | } 19 | 20 | func ConfigMapNSName(ais *aisv1.AIStore) types.NamespacedName { 21 | return types.NamespacedName{ 22 | Name: ConfigMapName(ais), 23 | Namespace: ais.Namespace, 24 | } 25 | } 26 | 27 | func NewStatsDCM(ais *aisv1.AIStore) *corev1.ConfigMap { 28 | return &corev1.ConfigMap{ 29 | ObjectMeta: metav1.ObjectMeta{ 30 | Name: ConfigMapName(ais), 31 | Namespace: ais.Namespace, 32 | }, 33 | Data: map[string]string{ 34 | ConfigFile: `{ 35 | "graphiteHost": "", 36 | "graphitePort": 2003 37 | }`, 38 | }, 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /helm/ais/config/tls-cert/neb-fin.yaml: -------------------------------------------------------------------------------- 1 | certificate: 2 | duration: 8760h # 1 year 3 | renewBefore: 720h # 30 days 4 | subject: 5 | organizations: 6 | - NVIDIA Corporation 7 | organizationalUnits: 8 | - NGC Storage 9 | countries: 10 | - US 11 | dnsNames: 12 | # used for readiness and liveness check 13 | - "localhost" 14 | # used for registration of targets 15 | - "ais-proxy" 16 | - "ais-target" 17 | # consistent url for client pod 18 | - "ais-proxy.ais.svc.cluster.local" 19 | # used for intra cluster communication 20 | - "*.ais-proxy.ais.svc.cluster.local" 21 | - "*.ais-target.ais.svc.cluster.local" 22 | # used for ais-operator communication 23 | - "ais-proxy.ais" 24 | ipAddresses: 25 | - 127.0.0.1 26 | - 10.8.54.22 27 | - 10.8.0.17 28 | - 10.8.54.3 29 | - 10.11.0.3 30 | - 10.11.0.2 31 | - 10.8.3.31 32 | - 10.11.59.11 33 | # LB 34 | - 10.10.177.173 35 | - 89.169.111.167 36 | emailAddress: ais@exchange.nvidia.com 37 | issuerRef: 38 | name: ca-issuer 39 | kind: ClusterIssuer -------------------------------------------------------------------------------- /operator/scripts/test_pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: operator-test-pod 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: operator-test 9 | image: operator-test 10 | imagePullPolicy: Never 11 | securityContext: 12 | privileged: true 13 | env: 14 | - name: TEST_E2E_MODE 15 | value: "${TEST_E2E_MODE}" 16 | - name: TEST_STORAGECLASS 17 | value: "${TEST_STORAGECLASS}" 18 | - name: TEST_STORAGE_HOSTPATH 19 | value: "${TEST_STORAGE_HOSTPATH}" 20 | - name: TEST_EPHEMERAL_CLUSTER 21 | value: "${TEST_EPHEMERAL_CLUSTER}" 22 | - name: AIS_TEST_NODE_IMAGE 23 | value: "${AIS_TEST_NODE_IMAGE}" 24 | - name: AIS_TEST_PREV_NODE_IMAGE 25 | value: "${AIS_TEST_PREV_NODE_IMAGE}" 26 | - name: AIS_TEST_INIT_IMAGE 27 | value: "${AIS_TEST_INIT_IMAGE}" 28 | - name: AIS_TEST_PREV_INIT_IMAGE 29 | value: "${AIS_TEST_PREV_INIT_IMAGE}" 30 | - name: AIS_TEST_API_MODE 31 | value: "${AIS_TEST_API_MODE}" 32 | -------------------------------------------------------------------------------- /helm/ais/config/ais/sjc4-dev.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | mpathInfo: 3 | storageClass: "ais-local-storage" 4 | size: 9.1Ti 5 | mounts: 6 | - path: "/ais/sda" 7 | - path: "/ais/sdb" 8 | - path: "/ais/sdc" 9 | - path: "/ais/sdd" 10 | - path: "/ais/sde" 11 | - 
path: "/ais/sdf" 12 | - path: "/ais/sdg" 13 | - path: "/ais/sdh" 14 | - path: "/ais/sdi" 15 | - path: "/ais/sdj" 16 | size: 3 17 | protocol: https 18 | https: 19 | skipVerifyCert: false 20 | tlsSecret: "tls-certs" 21 | authNSecretName: ais-authn-jwt-signing-key 22 | 23 | auth: 24 | serviceURL: https://ais-authn.ais:52001 25 | usernamePassword: 26 | secretName: ais-authn-su-creds 27 | secretNamespace: ais 28 | tls: 29 | insecureSkipVerify: true 30 | 31 | nodeImage: 32 | tag: latest 33 | initImage: 34 | tag: latest 35 | logSidecarImage: 36 | tag: v1.1 37 | configToUpdate: 38 | memsys: 39 | hk_time: 3m 40 | backend: 41 | aws: {} 42 | net: 43 | http: 44 | idle_conn_time: 20s 45 | idle_conns: 2048 46 | idle_conns_per_host: 128 47 | stateStorageClass: "local-path" -------------------------------------------------------------------------------- /monitoring/kube-prom/helmfile.yaml.gotmpl: -------------------------------------------------------------------------------- 1 | environments: 2 | prod: 3 | # Add defaults here if they include values used in other templates 4 | values: 5 | - ./environments/{{ .Environment.Name }}/values.yaml.gotmpl 6 | dev: 7 | values: 8 | - ./environments/{{ .Environment.Name }}/values.yaml.gotmpl 9 | 10 | --- 11 | 12 | repositories: 13 | - name: prometheus-community 14 | url: https://prometheus-community.github.io/helm-charts 15 | 16 | releases: 17 | - name: dashboard-configmap 18 | namespace: monitoring 19 | chart: ./dashboard-configmap 20 | createNamespace: true 21 | 22 | - name: prometheus 23 | namespace: monitoring 24 | createNamespace: true 25 | chart: prometheus-community/kube-prometheus-stack 26 | disableValidation: true 27 | needs: 28 | - dashboard-configmap 29 | values: 30 | - ./values/prometheus.yaml.gotmpl 31 | - ./values/alertmanager.yaml.gotmpl 32 | - ./values/kube-state-metrics.yaml.gotmpl 33 | - ./values/grafana.yaml.gotmpl 34 | - ./values/node-exporter.yaml 35 | - ./values/alert-rules.yaml -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | For AIStore, please see the [AIStore repository security documentation](https://github.com/NVIDIA/aistore/blob/main/SECURITY.md). 4 | 5 | ## Support 6 | 7 | We only provide security updates and support for the latest and most recent release versions of AIStore and the AIS K8s Operator. 8 | We strongly encourage users to keep their installations up-to-date with the latest stable releases. 9 | 10 | 11 | ## Reporting a Vulnerability 12 | 13 | If you discover a security vulnerability in the AIS K8s Operator, we encourage you to report it as soon as possible. 14 | To ensure the safety and security of our users, please do not disclose the vulnerability publicly until it has been addressed and a fix is available. 15 | 16 | **Notify the AIStore Development Team**: To report vulnerabilities, please email us at [aistore@nvidia.com](mailto:aistore@nvidia.com). 17 | Provide detailed information to help us quickly identify and address the issue. 18 | 19 | For more information on our security practices or if you have any other questions, please reach out to us at [aistore@nvidia.com](mailto:aistore@nvidia.com). 
-------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_shutdown_cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Playbook for shutting down the AIS cluster 3 | # After a shutdown, the nodes can rejoin the cluster at any later time 4 | # User data (buckets, objects) and configuration are preserved 5 | 6 | - name: Shutdown AIS cluster 7 | hosts: "controller" 8 | vars_prompt: 9 | - name: "shutdown_confirmation" 10 | prompt: "Are you sure you would like to shut down the AIS cluster - {{ cluster }}? Type 'yes' to confirm." 11 | default: "no" 12 | private: no 13 | 14 | pre_tasks: 15 | - name: check confirmation 16 | fail: 17 | msg: "Shutdown cluster not confirmed/forced" 18 | when: shutdown_confirmation != "yes" 19 | 20 | - name: check cluster name 21 | fail: 22 | msg: "cluster name not specified!" 23 | when: cluster is undefined 24 | 25 | gather_facts: false 26 | tasks: 27 | - name: Patch AIS cluster to initiate shutdown 28 | command: kubectl patch aistores.ais.nvidia.com ais -n {{ cluster }} --type=merge -p '{"spec":{"shutdownCluster":true}}' 29 | when: shutdown_confirmation == "yes" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /playbooks/host-config/docs/ais_enable_multiqueue.md: -------------------------------------------------------------------------------- 1 | # ais_enable_multiqueue 2 | 3 | ## Purpose 4 | 5 | Choosing `mq-deadline` over `deadline` offers a small performance win. 6 | Multiqueue IO schedulers are not enabled by default in Ubuntu 18.04 - they become 7 | the default in later versions of the Linux kernel. 8 | 9 | If your host install process does not already enable MQ, you can use this 10 | playbook to do so; the playbook changes the GRUB config and requires a 11 | reboot to take effect. You can see whether `mq-deadline` is available using 12 | `cat /sys/block/sda/queue/scheduler` (substituting your device name for `sda`) - if it does not appear in the list of available schedulers, consider enabling it.
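On a host where MQ is already enabled, the selected scheduler appears in brackets - sample output for illustration only (the device name and exact scheduler list will vary):

```console
$ cat /sys/block/sda/queue/scheduler
[mq-deadline] kyber none
```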
13 | 14 | Note that this playbook simply enables MQ IO scheduling - the selection 15 | of `mq-deadline` is performed in `ais_host_config_common.yml`. 16 | 17 | ## Usage 18 | 19 | ```console 20 | $ ansible-playbook -i hosts.ini ais_enable_multiqueue.yml -e ais_hosts=ais 21 | ``` 22 | 23 | This need only be applied to nodes that will host AIStore 24 | targets. 25 | 26 | The playbook notes that a reboot is required but does not initiate the reboot. -------------------------------------------------------------------------------- /playbooks/cloud/roles/oci_config/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Ensure target directory exists 2 | become: true 3 | file: 4 | path: "{{ target_dir }}" 5 | state: directory 6 | 7 | - name: Copy OCI private key file to target directory 8 | become: true 9 | ansible.builtin.copy: 10 | src: "oci_api_key" 11 | dest: "{{ target_dir }}/OCI_PRIVATE_KEY" 12 | 13 | - name: Create OCI config file from template 14 | become: true 15 | template: 16 | src: "config.j2" 17 | dest: "{{ target_dir }}/config" 18 | 19 | - name: Remove existing Kubernetes secret if it exists 20 | shell: kubectl delete secret {{ secret_name }} -n ais --ignore-not-found 21 | 22 | - name: Create new Kubernetes secret from OCI credentials 23 | shell: "kubectl create secret -n ais generic {{ secret_name }} \ 24 | --from-file=config={{ target_dir }}/config \ 25 | --from-file=OCI_PRIVATE_KEY={{ target_dir }}/OCI_PRIVATE_KEY \ 26 | --from-literal=OCI_COMPARTMENT_OCID='{{ oci_compartment_ocid }}'" 27 | 28 | - name: Clean up - remove target directory and its contents 29 | become: true 30 | file: 31 | path: "{{ target_dir }}" 32 | state: absent 33 | -------------------------------------------------------------------------------- /playbooks/host-config/roles/ais_gpuhost_device_plugin/tasks/main.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Now add device plugin 3 | # kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml 4 | # 5 | # It's a daemonset, so don't repeat command on all nodes! 6 | # 7 | # https://github.com/NVIDIA/k8s-device-plugin#preparing-your-gpu-nodes 8 | # 9 | # The following uses local_action to run kubectl on the local system (the one running Ansible). 10 | # Obviously this assumes that the local system is set up to run kubectl.
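# For reference, the tasks below consume the `nvidia_device_plugin_url` variable;
# an illustrative value, matching the manifest cited above (version subject to change):
#   nvidia_device_plugin_url: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml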
11 | # 12 | 13 | # 14 | # This role should not repeat for every host in the play 15 | # 16 | 17 | - name: Check whether nvidia-device-plugin DaemonSet is installed 18 | local_action: 19 | module: command 20 | _raw_params: kubectl get ds nvidia-device-plugin-daemonset --namespace=kube-system 21 | ignore_errors: True 22 | run_once: True 23 | register: ds 24 | changed_when: ds.rc != 0 25 | 26 | - name: Install nvidia-device-plugin DaemonSet if necessary 27 | local_action: 28 | module: command 29 | _raw_params: kubectl create -f {{ nvidia_device_plugin_url }} 30 | run_once: True 31 | when: ds.rc != 0 -------------------------------------------------------------------------------- /playbooks/ais-deployment/generate_https_cert.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Create a certificate using cert-manager for use of https 3 | # based AIStore deployments 4 | # 5 | --- 6 | - name: Generate TLS certificates 7 | hosts: controller 8 | gather_facts: false 9 | vars_files: 10 | - vars/https_config.yml 11 | pre_tasks: 12 | - name: Validate if cluster is defined 13 | fail: 14 | msg: "Variable 'cluster' not found. Add the 'cluster' variable during execution. e.g. ansible-playbook -i hosts.ini generate_https_cert.yml -e cluster=ais" 15 | when: cluster is undefined 16 | vars: 17 | - k8s_namespace: "{{ cluster }}" 18 | roles: 19 | - create_namespace 20 | - generate_https_cert 21 | 22 | - name: Fetch CA certificate for client 23 | hosts: controller 24 | gather_facts: false 25 | vars_files: 26 | - vars/https_config.yml 27 | pre_tasks: 28 | - name: Check if cacert_file is defined 29 | set_fact: 30 | skip_play: "{{ cacert_file is not defined }}" 31 | tasks: 32 | - name: Fetch certificate if cacert_file is defined 33 | include_role: 34 | name: fetch_ca_cert 35 | when: not skip_play 36 | -------------------------------------------------------------------------------- /helm/operator/tls-cert/templates/cert.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: operator-tls-cert 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | secretName: {{ .Values.spec.tlsSecret }} 9 | isCA: false 10 | duration: {{ .Values.spec.duration }} 11 | renewBefore: {{ .Values.spec.renewBefore }} 12 | usages: 13 | - client auth 14 | subject: 15 | organizations: 16 | {{- range .Values.spec.subject.organizations }} 17 | - {{ . }} 18 | {{- end }} 19 | organizationalUnits: 20 | {{- range .Values.spec.subject.organizationalUnits }} 21 | - {{ . }} 22 | {{- end }} 23 | countries: 24 | {{- range .Values.spec.subject.countries }} 25 | - {{ . }} 26 | {{- end }} 27 | dnsNames: 28 | {{- range .Values.spec.dnsNames }} 29 | - "{{ . }}" 30 | {{- end }} 31 | ipAddresses: 32 | {{- range .Values.spec.ipAddresses }} 33 | - "{{ . 
}}" 34 | {{- end }} 35 | emailAddresses: 36 | - {{ .Values.spec.emailAddress }} 37 | issuerRef: 38 | name: {{ .Values.spec.issuerRef.name }} 39 | kind: {{ .Values.spec.issuerRef.kind }} 40 | -------------------------------------------------------------------------------- /playbooks/cloud/roles/aws_config/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: Create .aws if it does not exist 2 | become: true 3 | file: 4 | path: "{{ target_dir }}" 5 | state: directory 6 | mode: '0755' # Permissions set to 755 to allow read and execute access by others 7 | 8 | # Add the aws config file to use to the roles/files directory 9 | - name: Copy aws config 10 | become: true 11 | ansible.builtin.copy: 12 | src: "config" 13 | dest: "{{ target_dir }}/config" 14 | 15 | # Add the aws credentials file to use to the roles/files directory 16 | - name: Copy aws credentials 17 | become: true 18 | ansible.builtin.copy: 19 | src: "credentials" 20 | dest: "{{ target_dir }}/credentials" 21 | 22 | - name: Delete existing kubernetes secret 23 | shell: kubectl delete secret {{ secret_name }} -n ais --ignore-not-found 24 | 25 | - name: Create kubernetes secret 26 | shell: "kubectl create secret -n ais generic {{ secret_name }} \ 27 | --from-file=config={{ target_dir }}/config \ 28 | --from-file=credentials={{ target_dir }}/credentials" 29 | 30 | - name: Clean up copied files 31 | become: true 32 | file: 33 | path: "{{ target_dir }}" 34 | state: absent -------------------------------------------------------------------------------- /helm/ais/charts/tls-cert/templates/cert.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: cert-manager.io/v1 3 | kind: Certificate 4 | metadata: 5 | name: ais-server-cert 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | secretName: {{ .Values.https.tlsSecret }} 9 | isCA: false 10 | duration: {{ .Values.certificate.duration }} 11 | renewBefore: {{ .Values.certificate.renewBefore }} 12 | usages: 13 | - server auth 14 | subject: 15 | organizations: 16 | {{- range .Values.certificate.subject.organizations }} 17 | - {{ . }} 18 | {{- end }} 19 | organizationalUnits: 20 | {{- range .Values.certificate.subject.organizationalUnits }} 21 | - {{ . }} 22 | {{- end }} 23 | countries: 24 | {{- range .Values.certificate.subject.countries }} 25 | - {{ . }} 26 | {{- end }} 27 | dnsNames: 28 | {{- range .Values.certificate.dnsNames }} 29 | - "{{ . }}" 30 | {{- end }} 31 | ipAddresses: 32 | {{- range .Values.certificate.ipAddresses }} 33 | - "{{ . }}" 34 | {{- end }} 35 | emailAddresses: 36 | - {{ .Values.certificate.emailAddress }} 37 | issuerRef: 38 | name: {{ .Values.certificate.issuerRef.name }} 39 | kind: {{ .Values.certificate.issuerRef.kind }} 40 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_deploy_cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: "controller" 3 | gather_facts: no 4 | vars_files: 5 | - "vars/ais_mpaths.yml" 6 | - "vars/https_config.yml" 7 | - "vars/multihome.yml" 8 | vars: 9 | - k8s_namespace: "{{cluster}}" 10 | 11 | pre_tasks: 12 | - name: Validate if ais_mpaths is defined 13 | fail: 14 | msg: "Variable 'ais_mpaths' not found. 
Refer to https://github.com/NVIDIA/ais-k8s/tree/main/docs#aistore-cluster-creation-process and populate the var in 'ais_mpaths.yml'" 15 | when: ais_mpaths is undefined 16 | 17 | - name: Validate if ais_mpath_size is defined 18 | fail: 19 | msg: "Variable 'ais_mpath_size' not found. Refer to https://github.com/NVIDIA/ais-k8s/tree/main/docs#aistore-cluster-creation-process and populate the var in 'ais_mpaths.yml'" 20 | when: ais_mpath_size is undefined 21 | 22 | - name: Validate if cluster is defined 23 | fail: 24 | msg: "Variable 'cluster' not found. Add the 'cluster' variable during execution. Use: ansible-playbook -i hosts.ini ais_deploy_cluster.yml -e cluster=ais" 25 | when: cluster is undefined 26 | 27 | roles: 28 | - create_namespace 29 | - create_pv 30 | - ais_deploy_cluster 31 | -------------------------------------------------------------------------------- /operator/scripts/go_install_tool.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Install Go tools with versioned binaries 5 | # Usage: ./go_install_tool.sh <target_binary> <package_url> <version> 6 | 7 | # Verify arguments 8 | if [ $# -ne 3 ]; then 9 | echo "Error: Invalid arguments" 10 | echo "Usage: $0 <target_binary> <package_url> <version>" 11 | exit 1 12 | fi 13 | 14 | target_binary="$1" 15 | package_url="$2" 16 | version="$3" 17 | 18 | # Extract components 19 | install_dir=$(dirname "$target_binary") 20 | binary_name=$(basename "$target_binary") 21 | base_name=$(echo "$binary_name" | sed "s/-${version}$//") 22 | src_binary="${install_dir}/${base_name}" 23 | 24 | # Create installation directory if missing 25 | mkdir -p "$install_dir" 26 | 27 | # Install if target doesn't exist 28 | if [ ! -f "$target_binary" ]; then 29 | echo "Installing ${package_url}@${version}" 30 | 31 | # Install to temporary name 32 | GOBIN="$install_dir" go install "${package_url}@${version}" 33 | 34 | # Rename if versioned name requested 35 | if [ "$src_binary" != "$target_binary" ]; then 36 | echo "Renaming ${base_name} => ${binary_name}" 37 | mv -f "$src_binary" "$target_binary" 38 | fi 39 | fi 40 | 41 | echo "Verified: ${target_binary}" -------------------------------------------------------------------------------- /.github/workflows/publish-pages.yml: -------------------------------------------------------------------------------- 1 | name: Publish Helm Charts to Github Pages 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | 9 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 16 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
17 | concurrency: 18 | group: "pages" 19 | cancel-in-progress: false 20 | 21 | jobs: 22 | deploy-pages: 23 | environment: 24 | name: github-pages 25 | url: ${{ steps.deployment.outputs.page_url }} 26 | runs-on: ubuntu-latest 27 | permissions: 28 | contents: read 29 | pages: write 30 | id-token: write 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@v6 34 | - name: Setup Pages 35 | uses: actions/configure-pages@v5 36 | - name: Upload artifact 37 | uses: actions/upload-pages-artifact@v4 38 | with: 39 | path: 'pages/' 40 | - name: Deploy to GitHub Pages 41 | id: deployment 42 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /auth/keycloak/prereq-helmfile.yaml: -------------------------------------------------------------------------------- 1 | repositories: 2 | - name: openebs 3 | url: https://openebs.github.io/charts 4 | - name: jetstack 5 | url: https://charts.jetstack.io 6 | - name: traefik 7 | url: https://helm.traefik.io/traefik 8 | 9 | releases: 10 | - name: openebs 11 | namespace: openebs 12 | createNamespace: true 13 | chart: openebs/openebs 14 | set: 15 | - name: engines.local.lvm.enabled 16 | value: false 17 | - name: engines.local.zfs.enabled 18 | value: false 19 | - name: engines.replicated.mayastor.enabled 20 | value: false 21 | - name: alloy.enabled 22 | value: false 23 | - name: loki.enabled 24 | value: false 25 | - name: minio.enabled 26 | value: false 27 | 28 | - name: cert-manager 29 | chart: jetstack/cert-manager 30 | namespace: cert-manager 31 | createNamespace: true 32 | set: 33 | - name: installCRDs 34 | value: true 35 | wait: true 36 | 37 | - name: trust-manager 38 | chart: jetstack/trust-manager 39 | namespace: cert-manager 40 | wait: true 41 | needs: 42 | - cert-manager 43 | 44 | - name: traefik 45 | chart: traefik/traefik 46 | namespace: traefik 47 | createNamespace: true -------------------------------------------------------------------------------- /playbooks/security/roles/kernel/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Configure Additional Process Hardening 3 | 4 | # Address space layout randomization (ASLR) is an exploit mitigation technique which randomly arranges 5 | # the address space of key data areas of a process. 6 | 7 | # The ptrace() system call provides a means by which one process (the "tracer") may observe and control 8 | # the execution of another process (the "tracee"), and examine and change the tracee's memory and registers. 
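# Values applied below, for reference: kernel.randomize_va_space=2 enables full
# ASLR (stack, mmap, VDSO, and heap randomization), while kernel.yama.ptrace_scope=1
# restricts ptrace so a process may only be traced by its descendants' ancestor
# (the direct parent chain) or by a tracer explicitly declared via prctl(PR_SET_PTRACER).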
9 | 10 | - name: Ensure sysctl configuration file exists 11 | file: 12 | path: /etc/sysctl.d/60-kernel_sysctl.conf 13 | state: touch 14 | 15 | - name: Ensure kernel.randomize_va_space is set in the sysctl configuration 16 | lineinfile: 17 | path: /etc/sysctl.d/60-kernel_sysctl.conf 18 | state: present 19 | regexp: '^kernel.randomize_va_space' 20 | line: 'kernel.randomize_va_space=2' 21 | 22 | - name: Ensure kernel.yama.ptrace_scope is set in the sysctl configuration 23 | lineinfile: 24 | path: /etc/sysctl.d/60-kernel_sysctl.conf 25 | state: present 26 | regexp: '^kernel.yama.ptrace_scope' 27 | line: 'kernel.yama.ptrace_scope=1' 28 | 29 | - name: Apply sysctl settings from the configuration file 30 | shell: sysctl -p /etc/sysctl.d/60-kernel_sysctl.conf 31 | 32 | 33 | -------------------------------------------------------------------------------- /operator/pkg/client/client_suite_test.go: -------------------------------------------------------------------------------- 1 | // Package client contains wrapper for k8s client 2 | /* 3 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package client 6 | 7 | import ( 8 | "testing" 9 | 10 | aisv1 "github.com/ais-operator/api/v1beta1" 11 | . "github.com/onsi/ginkgo/v2" 12 | . "github.com/onsi/gomega" 13 | "k8s.io/apimachinery/pkg/runtime" 14 | "k8s.io/client-go/kubernetes/scheme" 15 | "sigs.k8s.io/controller-runtime/pkg/client" 16 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 17 | logf "sigs.k8s.io/controller-runtime/pkg/log" 18 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 19 | ) 20 | 21 | func TestClient(t *testing.T) { 22 | RegisterFailHandler(Fail) 23 | 24 | RunSpecs(t, "Client Suite") 25 | } 26 | 27 | var _ = BeforeSuite(func() { 28 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 29 | 30 | err := scheme.AddToScheme(scheme.Scheme) 31 | Expect(err).NotTo(HaveOccurred()) 32 | 33 | err = aisv1.AddToScheme(scheme.Scheme) 34 | Expect(err).NotTo(HaveOccurred()) 35 | 36 | // +kubebuilder:scaffold:scheme 37 | }) 38 | 39 | func newFakeClient(objs []runtime.Object) client.Client { 40 | return fake.NewClientBuilder(). 41 | WithRuntimeObjects(objs...). 42 | WithScheme(scheme.Scheme). 43 | Build() 44 | } 45 | -------------------------------------------------------------------------------- /playbooks/host-config/docs/config_kubelet.md: -------------------------------------------------------------------------------- 1 | # config_kubelet.md 2 | 3 | ## Purpose 4 | 5 | Replaces `kubelet-extra-args.conf` in `/etc/systemd/system/kubelet.service.d/` on each of the kubernetes nodes and restarts the service to apply the new config. 6 | 7 | The file provided by default allows containers to apply any sysctls in the `net` namespace. As of this writing, we use it primarily to enable `net.core.somaxconn` in our containers which is an "unsafe" sysctl, i.e. not isolated between different pods on a node. See the [k8s docs on sysctls](https://kubernetes.io/docs/tasks/administer-cluster/sysctl-cluster/). If you have existing options or additional extra args to add to the kubelet, modify the [role vars](../roles/config_kubelet/vars/main.yml). 8 | 9 | ## net.core.somaxconn 10 | 11 | AIStore proxy and target Pods under load receive a very high number of 12 | socket connections from GPU client nodes. 13 | If average object size is small then the connection rate is correspondingly higher, and it is easy to 14 | overwhelm the socket listen queue depth. 15 | Modifying `net.core.somaxconn` increases this queue.
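With the kubelet allowlist in place, a pod opts in through its security context. A minimal sketch - the value shown is illustrative, not a tuned recommendation:

```yaml
securityContext:
  sysctls:
    - name: net.core.somaxconn
      value: "16384"
```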
16 | 17 | ## Usage 18 | 19 | ```console 20 | $ ansible-playbook -i ../hosts.ini config_kubelet.yml 21 | ``` 22 | 23 | This should be run for all nodes that may host an AIStore Pod. 24 | -------------------------------------------------------------------------------- /helm/cluster-issuer/issuer-chart/templates/ca.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: cert-manager.io/v1 3 | kind: ClusterIssuer 4 | metadata: 5 | name: {{ .Values.clusterIssuer.name }} 6 | spec: 7 | selfSigned: {} 8 | --- 9 | apiVersion: cert-manager.io/v1 10 | kind: Certificate 11 | metadata: 12 | name: {{ .Values.caCertificate.name }} 13 | namespace: cert-manager 14 | spec: 15 | isCA: true 16 | commonName: selfsigned-ca 17 | secretName: {{ .Values.ca_cert_secret }} 18 | duration: {{ .Values.caCertificate.duration }} 19 | renewBefore: {{ .Values.caCertificate.renewBefore }} 20 | privateKey: 21 | algorithm: {{ .Values.caCertificate.privateKey.algorithm }} 22 | encoding: {{ .Values.caCertificate.privateKey.encoding }} 23 | size: {{ .Values.caCertificate.privateKey.size }} 24 | subject: 25 | organizations: {{ .Values.caCertificate.subject.organizations }} 26 | organizationalUnits: {{ .Values.caCertificate.subject.organizationalUnits }} 27 | countries: {{ .Values.caCertificate.subject.countries }} 28 | issuerRef: 29 | name: {{ .Values.clusterIssuer.name }} 30 | kind: ClusterIssuer 31 | --- 32 | apiVersion: cert-manager.io/v1 33 | kind: ClusterIssuer 34 | metadata: 35 | name: {{ .Values.Issuer.name }} 36 | spec: 37 | ca: 38 | secretName: {{ .Values.ca_cert_secret }} -------------------------------------------------------------------------------- /playbooks/security/roles/crypto_policy/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Configure system wide crypto policy 3 | - name: Create the subpolicy directory if it doesn't exist 4 | file: 5 | path: /etc/crypto-policies/policies/modules 6 | state: directory 7 | mode: '0755' 8 | 9 | - name: Create the NO-SHA1 subpolicy file 10 | copy: 11 | dest: /etc/crypto-policies/policies/modules/NO-SHA1.pmod 12 | content: | 13 | # This is a subpolicy dropping the SHA1 hash and signature support 14 | hash = -SHA1 15 | sign = -*-SHA1 16 | sha1_in_certs = 0 17 | 18 | - name: Create the NO-SSHCBC subpolicy file 19 | copy: 20 | dest: /etc/crypto-policies/policies/modules/NO-SSHCBC.pmod 21 | content: | 22 | # This is a subpolicy to disable all CBC mode ciphers 23 | # for the SSH protocol (libssh and OpenSSH) 24 | cipher@SSH = -*-CBC 25 | 26 | - name: Create the NO-WEAKMAC subpolicy file 27 | copy: 28 | dest: /etc/crypto-policies/policies/modules/NO-WEAKMAC.pmod 29 | content: | 30 | # This is a subpolicy to disable weak macs 31 | mac = -*-64 32 | 33 | - name: Update the system-wide cryptographic policy to include the NO-SHA1, NO-SSHCBC, and NO-WEAKMAC subpolicies 34 | command: update-crypto-policies --set DEFAULT:NO-SHA1:NO-SSHCBC:NO-WEAKMAC 35 | -------------------------------------------------------------------------------- /playbooks/ais-deployment/ais_decommission_cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Playbook for decommissioning the AIS cluster 3 | # In decommission, all the configuration specific to AIStore is deleted from the nodes 4 | 5 | - name: Decommission cluster from controller 6 | hosts: "controller" 7 | vars_prompt: 8 | - name: "decomm_confirmation" 9 | prompt: "Are you sure you would 
like to decommission cluster - {{ cluster }}? Type 'yes' to confirm." 10 | default: "no" 11 | private: no 12 | - name: "cleanup_metadata" 13 | prompt: "Would you like to cleanup all metadata (api.DecommissionCluster)? Type 'yes' to confirm." 14 | default: "no" 15 | private: no 16 | - name: "cleanup_data" 17 | prompt: "Would you like to cleanup all user data (buckets and objects) from disks? Type 'yes' to confirm." 18 | default: "no" 19 | private: no 20 | 21 | pre_tasks: 22 | - name: check confirmation 23 | fail: 24 | msg: "decommission cluster not confirmed/forced" 25 | when: decomm_confirmation != "yes" 26 | 27 | - name: check cluster name 28 | fail: 29 | msg: "cluster name not specified!" 30 | when: cluster is undefined 31 | 32 | gather_facts: false 33 | roles: 34 | - role: ais_decommission_cluster 35 | vars: 36 | decommission_cluster: true 37 | -------------------------------------------------------------------------------- /helm/ais/charts/ais-cluster/values.yaml: -------------------------------------------------------------------------------- 1 | cluster: ais 2 | # size: 3 | protocol: http 4 | imagePullSecrets: 5 | # - name: 6 | nodeImage: 7 | name: aistorage/aisnode 8 | tag: v3.31 9 | initImage: 10 | name: aistorage/ais-init 11 | tag: v3.31 12 | logSidecarImage: 13 | name: aistorage/ais-logs 14 | tag: v1.1 15 | cloud: 16 | awsSecretName: 17 | gcpSecretName: 18 | ociSecretName: 19 | configToUpdate: 20 | shutdownCluster: false 21 | cleanupData: false 22 | cleanupMetadata: false 23 | apiMode: 24 | clusterDomain: 25 | stateStorageClass: 26 | authNSecretName: 27 | auth: 28 | enableExternalLB: false 29 | publicNetDNSMode: 30 | logsDir: 31 | proxySpec: 32 | tolerations: 33 | - key: "node.kubernetes.io/disk-pressure" 34 | operator: "Exists" 35 | effect: "NoExecute" 36 | hostPort: 51080 37 | servicePort: 51080 38 | portPublic: 51080 39 | portIntraControl: 51082 40 | portIntraData: 51083 41 | 42 | targetSpec: 43 | tolerations: 44 | - key: "node.kubernetes.io/disk-pressure" 45 | operator: "Exists" 46 | effect: "NoExecute" 47 | hostPort: 51081 48 | servicePort: 51081 49 | portPublic: 51081 50 | portIntraControl: 51082 51 | portIntraData: 51083 52 | 53 | proxyLB: 54 | enabled: false 55 | port: 51080 56 | annotations: 57 | prometheus.io/scrape: "true" -------------------------------------------------------------------------------- /playbooks/ais-deployment/docs/generate_https_cert.md: -------------------------------------------------------------------------------- 1 | # Generate self-signed cert with generate_https_cert 2 | 3 | ## Purpose 4 | 5 | The `generate_https_cert` playbook bootstraps a CA issuer and uses it to issue certificates for the AIS cluster, stored securely as a kubernetes secret. 6 | 7 | ## Usage 8 | 9 | To use this playbook, follow these steps: 10 | 11 | 1. Make sure you have Ansible installed on your system. 12 | 13 | 2. Create or edit your `hosts.ini` file to specify the `controller` host where you want to apply this playbook. 14 | 15 | 3. Update the variables to set namespace, DNS, and secret names in [vars/https_config.yml](../vars/https_config.yml) 16 | 17 | 4. Run the playbook using the following command: 18 | 19 | ```console 20 | $ ansible-playbook -i hosts.ini generate_https_cert.yml 21 | ``` 22 | This will execute the playbook and create the self-signed certificate on the specified controller host. 
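Afterwards, you can confirm the issued certificate landed in the expected secret - a quick check assuming the namespace and secret name configured in `https_config.yml` (shown here as `ais` and `tls-certs`):

```console
$ kubectl get secret tls-certs -n ais
```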
29 | 30 | To optionally output the resulting CA certificate to a local file, provide the `cacert_file` variable: 31 | 32 | ```console 33 | $ ansible-playbook -i hosts.ini generate_https_cert.yml -e cacert_file=local_ais_ca.crt -e cluster=ais 34 | ``` 35 | 36 | To fetch the certificate later, you can [use the fetch_ca_cert playbook](./ais_https_configuration.md#fetching-ca-certificate). 37 | -------------------------------------------------------------------------------- /playbooks/extra/oci/README.md: -------------------------------------------------------------------------------- 1 | ## OCI Network Config Playbook 2 | 3 | This playbook is used for our multi-homed deployments in OCI. It configures the host OS to use a secondary VNIC, as shown in the [OCI documentation](https://docs.oracle.com/iaas/compute-cloud-at-customer/topics/network/configuring-the-instance-os-for-a-secondary-vnic.htm#configuring-the-instance-os-for-a-secondary-vnic). 4 | 5 | The script in [roles/configure_networks/files/oci_vnic_config.sh](./roles/configure_networks/files/oci_vnic_config.sh) is taken from the Oracle-provided link above and modified to work for our use case. 6 | 7 | As provided, the script had issues with network namespaces on our current Oracle Linux OKE instances, so the version our role uses has been modified accordingly. 8 | 9 | Specifically, we commented out the reads from network namespaces, as they created duplicate entries in the lists the script uses to configure networks: 10 | 11 | ```bash 12 | # for ns in "${nss[@]}"; do 13 | # oci_vcn_ip_ifaces_read $ns 14 | # done 15 | ``` 16 | 17 | We also added an exception to filter out the `cni0` docker network, as this was not picked up as a virtual interface by default: 18 | 19 | ```bash 20 | if { [ -z "$IS_VM" ] || [ -z "${VIRTUAL_IFACES[$iface]}" ]; } && [ "${iface_data[1]}" != "cni0" ]; 21 | ``` 22 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | name: Publish ais-k8s release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | env: 9 | AISOPERATOR_IMAGE: 'aistorage/ais-operator' 10 | VERSION: ${{ github.ref_name }} 11 | 12 | jobs: 13 | release: 14 | runs-on: ubuntu-22.04 15 | steps: 16 | - name: Checkout Repo 17 | uses: actions/checkout@v6 18 | - uses: actions/setup-go@v6 19 | with: 20 | go-version: '1.25.x' 21 | - name: Login to DockerHub 22 | uses: docker/login-action@v3 23 | with: 24 | username: ${{ secrets.DOCKERHUB_USERNAME }} 25 | password: ${{ secrets.DOCKERHUB_TOKEN }} 26 | 27 | - name: Make aisoperator image 28 | run: | 29 | pushd $GITHUB_WORKSPACE/operator 30 | IMG="${{ env.AISOPERATOR_IMAGE }}:${{ env.VERSION }}" make docker-build docker-push 31 | IMG="${{ env.AISOPERATOR_IMAGE }}:latest" make docker-build docker-push 32 | popd 33 | 34 | - name: Make operator yaml file and helm chart 35 | run: | 36 | pushd $GITHUB_WORKSPACE/operator 37 | VERSION="${{ env.VERSION }}" IMG="${{ env.AISOPERATOR_IMAGE }}:${{ env.VERSION }}" make build-installer-helm 38 | popd 39 | 40 | - name: Release 41 | uses: softprops/action-gh-release@v2 42 | with: 43 | files: operator/dist/* 44 | prerelease: true 45 | -------------------------------------------------------------------------------- /auth/keycloak/scripts/prepare_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # This script 
creates an ais-admin user in an active Keycloak instance 5 | # Note: this REQUIRES a port-forward to already be running or a locally accessible cluster 6 | 7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | 9 | usage() { 10 | echo "Usage: $0 KEYCLOAK_HOST ADMIN_USER ADMIN_PASS [CA_CRT_PATH]" >&2 11 | exit 1 12 | } 13 | 14 | # Require exactly 3 or 4 args; the 4th (CA cert path) is optional 15 | if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then 16 | usage 17 | fi 18 | 19 | KEYCLOAK_HOST="$1" 20 | USER="$2" 21 | PASS="$3" 22 | CA_FILE="${4:-}" 23 | 24 | # Set up venv and requirements, rooted at the script's own directory so the script works from any CWD 25 | if [ -d "$SCRIPT_DIR/venv" ]; then 26 | echo "using pre-existing venv for keycloak ais-admin creation script" 27 | source "$SCRIPT_DIR/venv/bin/activate" 28 | else 29 | echo "venv not found, creating and installing requirements for keycloak ais-admin creation script" 30 | python3 -m venv "$SCRIPT_DIR/venv" 31 | source "$SCRIPT_DIR/venv/bin/activate" 32 | pip install -r "$SCRIPT_DIR/requirements.txt" 33 | fi 34 | 35 | # Build python arguments, conditionally add --verify-ca 36 | PY_ARGS=( 37 | "$SCRIPT_DIR/create_ais_admin.py" 38 | --host "$KEYCLOAK_HOST" 39 | --realm aistore 40 | --admin-user "$USER" 41 | --admin-pass "$PASS" 42 | ) 43 | 44 | if [ -n "$CA_FILE" ]; then 45 | PY_ARGS+=( --verify-ca "$CA_FILE" ) 46 | fi 47 | 48 | python "${PY_ARGS[@]}" -------------------------------------------------------------------------------- /helm/authn/charts/authn/templates/cert.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.tls.enabled .Values.tls.createCert }} 2 | --- 3 | apiVersion: cert-manager.io/v1 4 | kind: Certificate 5 | metadata: 6 | name: {{ .Release.Name }}-server-cert 7 | namespace: {{ .Release.Namespace }} 8 | spec: 9 | secretName: {{ .Values.tls.secretName }} 10 | isCA: false 11 | duration: {{ .Values.tls.certificate.duration }} 12 | renewBefore: {{ .Values.tls.certificate.renewBefore }} 13 | usages: 14 | - server auth 15 | subject: 16 | organizations: 17 | {{- range .Values.tls.certificate.subject.organizations }} 18 | - {{ . }} 19 | {{- end }} 20 | organizationalUnits: 21 | {{- range .Values.tls.certificate.subject.organizationalUnits }} 22 | - {{ . }} 23 | {{- end }} 24 | countries: 25 | {{- range .Values.tls.certificate.subject.countries }} 26 | - {{ . }} 27 | {{- end }} 28 | dnsNames: 29 | - localhost 30 | {{- range .Values.tls.certificate.dnsNames }} 31 | - {{ . }} 32 | {{- end }} 33 | ipAddresses: 34 | - 127.0.0.1 35 | {{- range .Values.tls.certificate.ipAddresses }} 36 | - {{ . }} 37 | {{- end }} 38 | emailAddresses: 39 | - {{ .Values.tls.certificate.emailAddress }} 40 | issuerRef: 41 | name: {{ .Values.tls.certificate.issuerRef.name }} 42 | kind: {{ .Values.tls.certificate.issuerRef.kind }} 43 | {{- end }} 44 | -------------------------------------------------------------------------------- /operator/api/v1beta1/util_types.go: -------------------------------------------------------------------------------- 1 | // Package contains declaration of AIS Kubernetes Custom Resource Definitions 2 | /* 3 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package v1beta1 6 | 7 | import ( 8 | "github.com/NVIDIA/aistore/cmn/cos" 9 | ) 10 | 11 | // Empty type is needed because declaring `map[string]struct{}` or `map[string]interface{}` 12 | // raises error "name requested for invalid type: struct{}/interface{}". 
13 | // For more information see: 14 | // - https://github.com/kubernetes-sigs/controller-tools/issues/636 15 | // - https://github.com/kubernetes-sigs/kubebuilder/issues/528 16 | type Empty struct{} 17 | 18 | // Duration is a wrapper over `cos.Duration` that overrides the type in generated manifests. 19 | // +kubebuilder:validation:Type=string 20 | type Duration cos.Duration 21 | 22 | func (d Duration) MarshalJSON() ([]byte, error) { return cos.Duration(d).MarshalJSON() } 23 | func (d *Duration) UnmarshalJSON(b []byte) (err error) { return (*cos.Duration)(d).UnmarshalJSON(b) } 24 | 25 | // SizeIEC is a wrapper over `cos.SizeIEC` that overrides the type in generated manifests. 26 | // +kubebuilder:validation:Type=string 27 | type SizeIEC cos.SizeIEC 28 | 29 | func (s SizeIEC) MarshalJSON() ([]byte, error) { return cos.SizeIEC(s).MarshalJSON() } 30 | func (s *SizeIEC) UnmarshalJSON(b []byte) (err error) { return (*cos.SizeIEC)(s).UnmarshalJSON(b) } 31 | -------------------------------------------------------------------------------- /operator/config/base/manifests/bases/ais-operator.clusterserviceversion.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: operators.coreos.com/v1alpha1 2 | kind: ClusterServiceVersion 3 | metadata: 4 | annotations: 5 | alm-examples: '[]' 6 | capabilities: Basic Install 7 | name: ais-operator.v0.0.0 8 | namespace: placeholder 9 | spec: 10 | apiservicedefinitions: {} 11 | customresourcedefinitions: 12 | owned: 13 | - description: AIStore is the Schema for the aistores API. 14 | displayName: AIStore 15 | kind: AIStore 16 | name: aistores.ais.nvidia.com 17 | version: v1beta1 18 | description: AIStore Operator 19 | displayName: AIS Operator 20 | icon: 21 | - base64data: "" 22 | mediatype: "" 23 | install: 24 | spec: 25 | deployments: null 26 | strategy: "" 27 | installModes: 28 | - supported: false 29 | type: OwnNamespace 30 | - supported: false 31 | type: SingleNamespace 32 | - supported: false 33 | type: MultiNamespace 34 | - supported: true 35 | type: AllNamespaces 36 | keywords: 37 | - AIStore 38 | - AIS 39 | - K8s 40 | - AI 41 | - Storage 42 | links: 43 | - name: AIS Operator 44 | url: https://ais-operator.domain 45 | maintainers: 46 | - email: aistore@nvidia.com 47 | name: AIStore Team 48 | maturity: alpha 49 | provider: 50 | name: NVIDIA Corporation 51 | url: https://aistore.nvidia.com 52 | version: 0.0.0 53 | -------------------------------------------------------------------------------- /helm/ais-client/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }} 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: {{ .Release.Name }} 10 | template: 11 | metadata: 12 | labels: 13 | app: {{ .Release.Name }} 14 | spec: 15 | serviceAccountName: {{ .Values.serviceAccount.name }} 16 | nodeSelector: 17 | {{- toYaml .Values.nodeSelector | nindent 8 }} 18 | containers: 19 | - name: ais-client 20 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 21 | imagePullPolicy: {{ .Values.image.pullPolicy }} 22 | command: [ "sleep", "infinity" ] 23 | resources: 24 | {{- toYaml .Values.resources | nindent 12 }} 25 | env: 26 | - name: AIS_ENDPOINT 27 | value: {{ .Values.ais.endpoint | quote }} 28 | - name: AIS_CLIENT_CA 29 | value: /etc/ais-ca/{{ .Values.ais.bundleFile }} 30 | volumeMounts: 31 | - name: ais-ca 32 | mountPath: /etc/ais-ca 33 | readOnly: true 34 | volumes: # CA bundle from a ConfigMap; marked optional below so the Pod starts even if it is absent 35 |
- name: ais-ca 36 | configMap: 37 | name: {{ .Values.ais.caConfigMap }} 38 | optional: true 39 | items: 40 | - key: {{ .Values.ais.bundleFile }} 41 | path: {{ .Values.ais.bundleFile }} 42 | -------------------------------------------------------------------------------- /operator/config/scorecard/patches/olm.config.yaml: -------------------------------------------------------------------------------- 1 | - op: add 2 | path: /stages/0/tests/- 3 | value: 4 | entrypoint: 5 | - scorecard-test 6 | - olm-bundle-validation 7 | image: quay.io/operator-framework/scorecard-test:v1.3.0 8 | labels: 9 | suite: olm 10 | test: olm-bundle-validation-test 11 | - op: add 12 | path: /stages/0/tests/- 13 | value: 14 | entrypoint: 15 | - scorecard-test 16 | - olm-crds-have-validation 17 | image: quay.io/operator-framework/scorecard-test:v1.3.0 18 | labels: 19 | suite: olm 20 | test: olm-crds-have-validation-test 21 | - op: add 22 | path: /stages/0/tests/- 23 | value: 24 | entrypoint: 25 | - scorecard-test 26 | - olm-crds-have-resources 27 | image: quay.io/operator-framework/scorecard-test:v1.3.0 28 | labels: 29 | suite: olm 30 | test: olm-crds-have-resources-test 31 | - op: add 32 | path: /stages/0/tests/- 33 | value: 34 | entrypoint: 35 | - scorecard-test 36 | - olm-spec-descriptors 37 | image: quay.io/operator-framework/scorecard-test:v1.3.0 38 | labels: 39 | suite: olm 40 | test: olm-spec-descriptors-test 41 | - op: add 42 | path: /stages/0/tests/- 43 | value: 44 | entrypoint: 45 | - scorecard-test 46 | - olm-status-descriptors 47 | image: quay.io/operator-framework/scorecard-test:v1.3.0 48 | labels: 49 | suite: olm 50 | test: olm-status-descriptors-test 51 | --------------------------------------------------------------------------------