├── .editorconfig ├── .gitattributes ├── .github ├── CODEOWNERS └── workflows │ ├── closing-soon.yml │ ├── fsdp-eks-regression.yml │ ├── fsdp-regression-test-container.yml │ ├── fsdp-regression-test-venv.yml │ └── megatron-ci-slurm.yaml ├── .gitignore ├── .markdownlint.jsonc ├── 0.docs ├── EnableIdentityCenter.png ├── IdentityCenterSetup1.png ├── IdentityCenterSetup10.png ├── IdentityCenterSetup2.png ├── IdentityCenterSetup3.png ├── IdentityCenterSetup4.png ├── IdentityCenterSetup5.png ├── IdentityCenterSetup6.png ├── IdentityCenterSetup7.png ├── IdentityCenterSetup8.png ├── IdentityCenterSetup9.png ├── batch-arch.png ├── core-infra-architecture.png ├── deploy_prometheus_grafana_cfn.png ├── deployment diagrams.png ├── diagrams-omnigraffle.graffle ├── diagrams-templates.pptx ├── eks-model-training-multi-az.drawio ├── eks-model-training-single-az.png ├── fsx-lustre-template.png ├── observability_architecture.png ├── parallelcluster-arch-diagram.png ├── parallelcluster-prerequisites-cfn.png ├── ssm-connect-user.png ├── ssm-connect.png ├── vpc-all-az.png ├── vpc-one-az.png └── vpc-template.png ├── 1.architectures ├── 0.common │ ├── 0.private-bucket.yaml │ ├── README.md │ └── hyperpod-event-bridge-email.yaml ├── 1.vpc_network │ ├── 1.vpc-multi-az.yaml │ ├── 2.vpc-one-az.yaml │ ├── README.md │ └── status.sh ├── 2.aws-parallelcluster │ ├── .gitignore │ ├── README-full-fledged.md │ ├── README.md │ ├── cluster-templates │ │ └── cluster-vanilla.yaml │ ├── infra-templates │ │ ├── parallelcluster-prerequisites-p1.yaml │ │ └── parallelcluster-prerequisites.yaml │ ├── tips-and-tricks.md │ ├── troubleshooting-guide.md │ └── utils │ │ ├── create_config.sh │ │ ├── easy-ssh.sh │ │ └── pcluster-fetch-config.sh ├── 3.aws-batch │ ├── 0.aws-batch-distributed-training-p5.yaml │ ├── 0.aws-batch-distributed-training.yaml │ ├── README.md │ └── aws-batch-distributed-training-p6.yaml ├── 4.amazon-eks │ ├── README.md │ ├── eks-g4dn-vpc.yaml │ ├── eks-g4dn.yaml │ ├── eks-g5-node-autorepair.yaml │ ├── eks-p4de-odcr-vpc.yaml │ ├── eks-p4de-odcr.yaml │ ├── eks-p5-capacity-block.yaml │ └── eks-p5-odcr-vpc.yaml ├── 5.sagemaker-hyperpod │ ├── 0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json │ ├── 1.AmazonSageMakerClustersExecutionRolePolicy.json │ ├── 2.SageMakerVPC.yaml │ ├── 3.FSxLustre.yaml │ ├── LifecycleScripts │ │ └── base-config │ │ │ ├── add_users.sh │ │ │ ├── apply_hotfix.sh │ │ │ ├── config.py │ │ │ ├── hotfix │ │ │ ├── hold-lustre-client.sh │ │ │ └── mock-gpu-driver-deb.sh │ │ │ ├── lifecycle_script.py │ │ │ ├── mount_fsx.sh │ │ │ ├── mount_fsx_openzfs.sh │ │ │ ├── multi_headnode_setup │ │ │ ├── headnode_notification.sh │ │ │ └── headnode_setup.sh │ │ │ ├── observability │ │ │ ├── dcgm_metrics_config │ │ │ │ ├── dcgm-metrics-advanced.csv │ │ │ │ └── dcgm-metrics-basic.csv │ │ │ ├── install_dcgm_exporter.sh │ │ │ ├── install_efa_exporter.sh │ │ │ ├── install_node_exporter.sh │ │ │ ├── install_observability.py │ │ │ ├── install_otel_collector.sh │ │ │ ├── install_slurm_exporter.sh │ │ │ ├── otel_config │ │ │ │ ├── config-compute-template.yaml │ │ │ │ └── config-head-template.yaml │ │ │ └── stop_observability.py │ │ │ ├── on_create.sh │ │ │ ├── setup_mariadb_accounting.sh │ │ │ ├── setup_rds_accounting.sh │ │ │ ├── setup_sssd.py │ │ │ ├── setup_user_associations.sh │ │ │ ├── shared_users_sample.txt │ │ │ ├── start_slurm.sh │ │ │ └── utils │ │ │ ├── enable_slurm_log_rotation.sh │ │ │ ├── enroot.conf │ │ │ ├── fsx_ubuntu.sh │ │ │ ├── gen-keypair-ubuntu.sh │ │ │ ├── install_ansible.sh │ │ │ ├── install_docker.sh │ │ │ ├── install_enroot_pyxis.sh │ │ │ ├── motd.sh │ │ │ ├── motd.txt │ │ │ ├── mount-s3.sh │ │ │ ├── pam_adopt_cgroup_wheel.sh │ │ │ ├── slurm_fix_plugstackconf.sh │ │ │ ├── ssh-to-compute.sh │ │ │ └── update_neuron_sdk.sh │ ├── README.md │ ├── automate-smhp-slurm │ │ ├── README.md │ │ ├── automate-cluster-creation.sh │ │ └── media │ │ │ └── automate-smhp-demo.gif │ ├── create_config.sh │ ├── easy-ssh.sh │ ├── hyperpod-precheck.py │ ├── patching-backup.sh │ ├── sagemaker-hyperpod-slurm-multi-headnode.yaml │ ├── sagemaker-hyperpod.yaml │ ├── slurm-studio │ │ ├── README.md │ │ ├── media │ │ │ ├── 01-studio-hyperpod-architecture.png │ │ │ ├── 02-studio-home.png │ │ │ ├── 03-codeditor-fsx.png │ │ │ ├── 07-fsx-shared.png │ │ │ ├── 08-fsx-partitioned.png │ │ │ ├── 09-studio-user.png │ │ │ └── 10-filesystem-check.png │ │ ├── slurm_lifecycle.sh │ │ └── studio-slurm.yaml │ ├── terraform-modules │ │ ├── README.md │ │ └── hyperpod-slurm-tf │ │ │ ├── .gitignore │ │ │ ├── easy-ssh.sh │ │ │ ├── main.tf │ │ │ ├── modules │ │ │ ├── fsx_lustre │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── fsx_openzfs │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── hyperpod_cluster │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── lifecycle_script │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── private_subnet │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── s3_bucket │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── s3_endpoint │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── sagemaker_iam_role │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── security_group │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ └── vpc │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── providers.tf │ │ │ ├── terraform.tfvars.example │ │ │ ├── terraform_outputs.sh │ │ │ └── variables.tf │ ├── tools │ │ ├── README.md │ │ └── dump_cluster_nodes_info.py │ └── validate-config.py ├── 6.ldap_server │ ├── README.md │ └── cf_ldap_server.yaml ├── 7.sagemaker-hyperpod-eks │ ├── LifecycleScripts │ │ └── base-config │ │ │ └── on_create.sh │ ├── README-manual-steps.md │ ├── README.md │ ├── automate-smhp-eks │ │ ├── README.md │ │ ├── automate-eks-cluster-creation.sh │ │ ├── hyperpod-eks-cluster-creation.sh │ │ └── media │ │ │ ├── automate-smhp-eks-demo.gif │ │ │ └── helper-script.png │ ├── cfn-templates │ │ ├── README.md │ │ ├── helm-chart-injector │ │ │ ├── .gitignore │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── build-layer.sh │ │ │ ├── deploy.sh │ │ │ ├── lambda_function │ │ │ │ ├── lambda_function.py │ │ │ │ └── requirements.txt │ │ │ ├── package-function.sh │ │ │ └── run-docker-build.sh │ │ ├── hyperpod-eks-full-stack.yaml │ │ ├── nested-stack-modules.png │ │ ├── nested-stacks │ │ │ ├── eks-cluster-stack.yaml │ │ │ ├── helm-chart-stack.yaml │ │ │ ├── hyperpod-cluster-stack.yaml │ │ │ ├── lifecycle-script-stack.yaml │ │ │ ├── main-stack.yaml │ │ │ ├── private-subnet-stack.yaml │ │ │ ├── s3-bucket-stack.yaml │ │ │ ├── s3-endpoint-stack.yaml │ │ │ ├── sagemaker-iam-role-stack.yaml │ │ │ ├── security-group-stack.yaml │ │ │ └── vpc-stack.yaml │ │ ├── sagemaker-studio-fsx-stack.yaml │ │ └── sagemaker-studio-stack.yaml │ ├── create_config.sh │ ├── slinky-slurm │ │ ├── .gitignore │ │ ├── Docker-Build-README.md │ │ ├── README.md │ │ ├── dlc-slurmd.Dockerfile │ │ ├── g5 │ │ │ ├── g5-custom.tfvars │ │ │ ├── g5-llama2_7b-training.sbatch │ │ │ ├── g5-params.json │ │ │ └── g5-values.yaml │ │ ├── lustre-pvc-slurm.yaml │ │ ├── lustre-storageclass.yaml │ │ ├── openzfs-pvc-slurm.yaml │ │ ├── openzfs-storageclass.yaml │ │ ├── p5 │ │ │ ├── p5-custom.tfvars │ │ │ ├── p5-llama2_7b-training.sbatch │ │ │ ├── p5-params.json │ │ │ └── p5-values.yaml │ │ └── slinky-slurm-hp-eks.png │ ├── smhp-eks-arch.png │ ├── task-governance │ │ ├── 1-imagenet-gpu-team-a.yaml │ │ ├── 2-hyperpod-cli-example-team-b.yaml │ │ ├── 3-imagenet-gpu-team-b-higher-prio.yaml │ │ ├── README.md │ │ └── visualize_topology.sh │ └── terraform-modules │ │ ├── .gitignore │ │ ├── README.md │ │ ├── hyperpod-eks-tf │ │ ├── custom.tfvars │ │ ├── main.tf │ │ ├── modules │ │ │ ├── eks_cluster │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── helm_chart │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── hyperpod_cluster │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── lifecycle_script │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── private_subnet │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── s3_bucket │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── sagemaker_iam_role │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── security_group │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── vpc │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ └── vpc_endpoints │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ ├── outputs.tf │ │ ├── providers.tf │ │ ├── rig_custom.tfvars │ │ ├── terraform.tfvars │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── smhp_tf_modules.png │ │ └── terraform_outputs.sh ├── 8.accounting-database │ ├── README.md │ └── cf_database-accounting.yaml └── efa-cheatsheet.md ├── 2.ami_and_containers ├── 1.amazon_machine_image │ ├── Makefile │ ├── README.md │ ├── inventory │ │ ├── group_vars │ │ │ └── all.yml │ │ └── hosts │ ├── packer-ami.pkr.hcl │ ├── playbook-dlami-gpu.yml │ ├── playbook-dlami-neuron.yml │ ├── playbook-eks-gpu.yml │ ├── playbook-pcluster-cpu.yml │ ├── playbook-pcluster-gpu.yml │ ├── playbook-pcluster-neuron.yml │ └── roles │ │ ├── aws_cliv2 │ │ ├── molecule │ │ │ └── default │ │ │ │ ├── converge.yml │ │ │ │ ├── molecule.yml │ │ │ │ ├── prepare.yml │ │ │ │ └── verify.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── aws_efa │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── aws_efa_ofi │ │ └── tasks │ │ │ └── main.yml │ │ ├── aws_lustre │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── base │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── docker │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── neuron_driver │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── nvidia_cuda │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── nvidia_docker │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── nvidia_driver │ │ ├── defaults │ │ │ └── main.yml │ │ ├── files │ │ │ └── nvidia-persistenced-override.service │ │ └── tasks │ │ │ └── main.yml │ │ ├── nvidia_enroot_pyxis │ │ ├── defaults │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── enroot.conf │ │ ├── nvidia_gdrcopy │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── nvidia_nccl │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── observability │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ ├── packages │ │ ├── defaults │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ │ └── pytorch_neuron │ │ ├── defaults │ │ └── main.yml │ │ └── tasks │ │ └── main.yml ├── 3.pcluster_create_dlami │ ├── 01.dlami-ub2004-base-gpu.yaml │ ├── 02.dlami-ub2004-pytorch-gpu.yaml │ └── README.md ├── containers │ └── pytorch │ │ ├── 0.nvcr-pytorch-aws.dockerfile │ │ └── README.md └── tools │ └── ec2md │ ├── README.md │ └── ec2md.sh ├── 3.test_cases ├── jax │ ├── README.md │ ├── jax.sbatch │ ├── jax_paxml.Dockerfile │ └── run_paxml.sh ├── megatron │ ├── bionemo │ │ ├── 0.Dockerfile │ │ ├── 1.uniref50.slurm │ │ ├── 2.esm1nv_pretrain.slurm │ │ ├── README.md │ │ ├── bionemo_2.5 │ │ │ ├── Dockerfile │ │ │ ├── build.sh │ │ │ ├── enroot.sh │ │ │ ├── get-data.sh │ │ │ └── train-esm.sbatch │ │ ├── prepare_uniref50.py │ │ └── requirements.txt │ ├── megatron-lm │ │ ├── README.md │ │ ├── aws-megatron-lm.Dockerfile │ │ ├── kubernetes │ │ │ ├── README.md │ │ │ └── gpt3 │ │ │ │ ├── README.md │ │ │ │ └── manifests │ │ │ │ ├── .gitignore │ │ │ │ ├── getdata-job.yaml-template │ │ │ │ ├── prepdata-job.yaml-template │ │ │ │ └── pytorchjob.yaml-template │ │ ├── slurm │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── gpt3 │ │ │ │ ├── 1.data-preprocessing.sbatch │ │ │ │ ├── 2.distributed-training.sbatch │ │ │ │ └── README.md │ │ │ └── llama2 │ │ │ │ ├── README.md │ │ │ │ ├── data-preproc-llama2.sbatch │ │ │ │ └── pretrain-llama2.sbatch │ │ └── test_megatron_lm.py │ ├── nemo │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── kubernetes │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── build.sh │ │ │ ├── custom_data_module.py │ │ │ ├── data-processing │ │ │ │ ├── data-processing-pod-template.yaml │ │ │ │ ├── data-processing.sh │ │ │ │ └── load_dataset.py │ │ │ ├── env_vars.json │ │ │ ├── finetune_custom_dataset.py │ │ │ ├── finetune_default_dataset.py │ │ │ ├── megatron │ │ │ │ ├── megatron-gpt-345m_merges │ │ │ │ └── megatron-gpt-345m_vocab │ │ │ ├── pretrain_custom_dataset.py │ │ │ ├── pretrain_mock_dataset.py │ │ │ ├── push.sh │ │ │ └── venv.sh │ │ └── slurm │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── env_vars.json │ │ │ ├── run.py │ │ │ └── venv.sh │ └── nemo1.0 │ │ ├── .gitignore │ │ ├── 0.NemoMegatron-aws-optimized.Dockerfile │ │ ├── 1.bmk-pretrain-gpt3-126m.sh │ │ ├── 2.bmk-pretrain-gpt3-5b.sh │ │ ├── 3.bmk-pretrain-gpt3-40b.sh │ │ ├── 4.bmk-pretrain-gpt3-175b.sh │ │ ├── 5.bmk-pretrain-llama-7b.sh │ │ ├── 6.bmk-pretrain-llama-70b.sh │ │ ├── EKS │ │ ├── 0.Dockerfile │ │ ├── README.md │ │ ├── fsx.png │ │ ├── fsx │ │ │ ├── fsx-pv.yaml │ │ │ ├── fsx-pvc.yaml │ │ │ └── fsx-storage-class.yaml │ │ ├── launcher_scripts │ │ │ └── conf │ │ │ │ ├── cluster │ │ │ │ └── k8s.yaml │ │ │ │ ├── config.yaml │ │ │ │ └── data_preparation │ │ │ │ └── gpt3 │ │ │ │ ├── data_prep.yaml │ │ │ │ └── download_gpt3_pile.yaml │ │ └── p4de-cluster-config.yaml │ │ ├── README.md │ │ ├── conf.template │ │ ├── cluster │ │ │ └── bcm.yaml │ │ └── config.yaml │ │ └── test_nemo_launcher.py └── pytorch │ ├── FSDP │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── generate-sbatch-training-files.py │ ├── kubernetes │ │ ├── README.md │ │ ├── fsdp.yaml-template │ │ ├── llama2_13b-fsdp.yaml │ │ ├── llama2_70b-fsdp.yaml │ │ ├── llama2_7b-fsdp.yaml │ │ ├── llama3_1_70b-fsdp.yaml │ │ ├── llama3_1_8b-fsdp-hpto.yaml │ │ ├── llama3_1_8b-fsdp.yaml │ │ ├── llama3_2_1b-fsdp.yaml │ │ ├── llama3_2_3b-fsdp.yaml │ │ ├── mathstral_7b-fsdp.yaml │ │ ├── mistral_8x7b-fsdp.yaml │ │ └── training_kubernetes.template │ ├── models │ │ ├── llama2_13b.txt │ │ ├── llama2_70b.txt │ │ ├── llama2_7b.txt │ │ ├── llama3_1_70b.txt │ │ ├── llama3_1_8b.txt │ │ ├── llama3_2_1b.txt │ │ ├── llama3_2_3b.txt │ │ ├── mathstral_7b.txt │ │ └── mistral_8x7b.txt │ ├── slurm │ │ ├── README.md │ │ ├── create_venv.sh │ │ ├── llama2_13b-training.sbatch │ │ ├── llama2_70b-training.sbatch │ │ ├── llama2_7b-training.sbatch │ │ ├── llama3_1_70b-training.sbatch │ │ ├── llama3_1_8b-training.sbatch │ │ ├── llama3_2_1b-training.sbatch │ │ ├── llama3_2_3b-training.sbatch │ │ ├── mathstral_7b-training.sbatch │ │ ├── mistral_8x7b-training.sbatch │ │ └── training-sub.template │ └── src │ │ ├── model_utils │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── checkpoint.py │ │ ├── concat_dataset.py │ │ └── train_utils.py │ │ ├── requirements.txt │ │ └── train.py │ ├── cpu-ddp │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── ddp.py │ ├── kubernetes │ │ ├── README.md │ │ ├── fsdp-simple.yaml │ │ └── fsdp.yaml-template │ └── slurm │ │ ├── 0.create-conda-env.sh │ │ ├── 1.conda-train.sbatch │ │ ├── 2.create-enroot-image.sh │ │ ├── 3.container-train.sbatch │ │ └── README.md │ ├── deepspeed │ ├── 0.deepspeed.dockerfile │ ├── 1.build-image.sbatch │ ├── Makefile │ ├── README.md │ └── examples_megatron_deepspeed │ │ ├── .gitignore │ │ ├── README.md │ │ └── finetune_hf_llama │ │ ├── .gitignore │ │ ├── 1.convert-weights-to-hf.sbatch │ │ ├── 2.convert-weights-to-mega-ds.sh │ │ ├── 3.finetune-llama.sh │ │ ├── README.md │ │ ├── configs │ │ └── ds_config.json │ │ ├── scripts │ │ ├── convert-weights-hf-to-megatron-deepspeed.sh │ │ ├── finetune_llama.sbatch │ │ └── finetune_llama.sh │ │ └── src │ │ └── convert_llama_weights_to_hf.py │ ├── distillation │ ├── Dockerfile │ ├── README.md │ ├── kubernetes │ │ └── distill.yaml-template │ └── src │ │ ├── distil_logits_cli.py │ │ └── requirements.txt │ ├── mosaicml-composer │ ├── mpt │ │ ├── 0.llm-foundry.Dockerfile │ │ ├── 1.c4-preprocess.sbatch │ │ ├── 2.train-mpt-manual-distributed.sbatch │ │ ├── Makefile │ │ └── README.md │ └── stable-diffusion │ │ ├── README.md │ │ ├── multi-node │ │ ├── 1.Dockerfile │ │ ├── 2.train.sbatch │ │ ├── 3.stable-diffusion-eks.yaml-template │ │ ├── 4.etcd.yaml │ │ ├── p5-model-scaling-stable-diff-throughput.png │ │ └── p5-model-scaling-stable-diff.png │ │ └── single-node │ │ ├── 0.Dockerfile │ │ ├── calculate_number_of_parameters.py │ │ └── sd_p4de_p5.png │ ├── neuronx-distributed │ └── llama3 │ │ ├── kubernetes │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── generate-jobspec.sh │ │ ├── llama3_train.yaml-template │ │ ├── src │ │ │ └── tokenize_data.py │ │ └── tokenize_data.yaml-template │ │ └── slurm │ │ └── README.md │ ├── optimum-neuron │ └── llama3 │ │ ├── kubernetes │ │ └── fine-tuning │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── generate-jobspec.sh │ │ │ └── templates │ │ │ ├── compile_peft.yaml-template │ │ │ ├── consolidation.yaml-template │ │ │ ├── launch_peft_train.yaml-template │ │ │ ├── merge_lora.yaml-template │ │ │ └── tokenize_data.yaml-template │ │ ├── slurm │ │ └── fine-tuning │ │ │ ├── README.md │ │ │ ├── finetune-llama3-8B.sh │ │ │ └── submit_jobs │ │ │ ├── 0.create_env.sh │ │ │ ├── 1.download_model.sh │ │ │ ├── 2.compile_model.sh │ │ │ ├── 3.finetune.sh │ │ │ ├── 4.model_consolidation.sh │ │ │ ├── 5.merge_lora_weights.sh │ │ │ └── 6.inference.sh │ │ └── src │ │ ├── get_model.py │ │ ├── merge_lora_weights.py │ │ ├── model_consolidation.py │ │ ├── peft_tokenize_data.py │ │ ├── run_inference.py │ │ └── train.py │ ├── picotron │ ├── .gitignore │ ├── README.md │ ├── SmolLM-1.7B │ │ ├── README.md │ │ ├── ec2 │ │ │ └── README.md │ │ └── slurm │ │ │ ├── README.md │ │ │ └── train.sbatch │ ├── create_config.py │ ├── picotron.Dockerfile │ └── train.py │ ├── torchtitan │ ├── README.md │ └── slurm │ │ ├── .gitignore │ │ ├── 0.create_conda_env.sh │ │ ├── 1.llama_3_8b_torchtitan.sh │ │ └── README.md │ ├── trl │ └── grpo │ │ ├── README.md │ │ ├── deepspeed_zero3.yaml │ │ ├── eval.py │ │ ├── grpo.Dockerfile │ │ ├── grpo_wandb.png │ │ ├── inference.py │ │ ├── train.py │ │ └── train.sbatch │ └── verl │ └── rlvr │ ├── Dockerfile │ ├── README.md │ ├── img │ └── ray-dashboard.png │ ├── job-stop.sh │ ├── observability │ ├── README.md │ └── add-ray-metrics.sh │ ├── ray-expose.sh │ ├── ray-hide.sh │ ├── recipe │ ├── run_dapo_configurable.sh │ └── run_grpo_configurable.sh │ └── setup │ ├── build-push.sh │ ├── env_vars.example │ ├── install-kuberay.sh │ ├── load_data_dapo.sh │ ├── load_data_grpo.sh │ └── raycluster.yaml ├── 4.validation_and_observability ├── 1.pytorch-env-validation │ ├── 0.pytorch-screen.Dockerfile │ ├── 1.torch-screen.sbatch │ ├── README.md │ └── pytorch-screen.py ├── 3.efa-node-exporter │ ├── Dockerfile │ ├── EKS │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── ci │ │ │ └── port-values.yaml │ │ ├── efa-exporter-values-temp.yaml │ │ └── templates │ │ │ ├── NOTES.txt │ │ │ ├── _helpers.tpl │ │ │ ├── clusterrole.yaml │ │ │ ├── clusterrolebinding.yaml │ │ │ ├── daemonset.yaml │ │ │ ├── endpoints.yaml │ │ │ ├── extra-manifests.yaml │ │ │ ├── networkpolicy.yaml │ │ │ ├── podmonitor.yaml │ │ │ ├── psp-clusterrole.yaml │ │ │ ├── psp-clusterrolebinding.yaml │ │ │ ├── psp.yaml │ │ │ ├── rbac-configmap.yaml │ │ │ ├── service.yaml │ │ │ ├── serviceaccount.yaml │ │ │ ├── servicemonitor.yaml │ │ │ └── verticalpodautoscaler.yaml │ ├── Makefile │ ├── README.md │ ├── amazon_efa_linux.go │ ├── buildspec.yaml │ ├── class_amazon_efa.go │ └── docker-compose.yml ├── 4.prometheus-grafana │ ├── 1click-dashboards-deployment │ │ ├── README.md │ │ ├── cluster-observability.yaml │ │ ├── dashboards │ │ │ ├── .gitignore │ │ │ ├── DCGM_exporter_dashboard.json │ │ │ ├── create_ml_dashboards.py │ │ │ └── requirements.txt │ │ ├── img │ │ │ └── observability-dashboard.png │ │ ├── managed-cluster-observability-pc.yaml │ │ └── prometheus-agent-collector.yaml │ ├── README-OS-grafana.md │ ├── README-grafana-alerts.md │ ├── README.md │ ├── assets │ │ ├── Observability-Architecture.png │ │ ├── add-to-channel.png │ │ ├── alert-firing.png │ │ ├── alert-test.png │ │ ├── alert-threshold.png │ │ ├── configure-contact-point.png │ │ ├── configure_grafana_alerting.png │ │ ├── create-slack-app.png │ │ ├── dcgm-dashboard.png │ │ ├── efa-node-dashboard.png │ │ ├── enable_grafana_alerting.png │ │ ├── eval-threshold.png │ │ ├── gpu-health-alert.png │ │ ├── gpu-health-by-node.png │ │ ├── gpu-health.png │ │ ├── grafana-datasource-configure.png │ │ ├── grafana-datasource.png │ │ ├── grafana-service-token-lambda-function.zip │ │ ├── grafana_users.png │ │ ├── grafana_users_admin.png │ │ ├── observability-slurm-custom-resource-function.zip │ │ ├── observability_architecture.png │ │ ├── os-grafana-set-datasource1.png │ │ ├── os-grafana-set-datasource2.png │ │ ├── os-grafana-set-datasource3.png │ │ ├── prometheus_running.png │ │ ├── retrieve-amp-endpoint.png │ │ ├── slack-app-workspace.png │ │ ├── slack-scopes.png │ │ └── slurm-dashboard.png │ ├── cluster-observability-os-grafana.yaml │ ├── cluster-observability.yaml │ ├── dcgm-metrics.csv │ └── update-prometheus.sh ├── 5.nsight │ ├── 2.generate_recipes.sh │ ├── EKS │ │ ├── Dockerfile.llama2-efa │ │ ├── custom_values.yaml │ │ ├── fsdp.yaml │ │ ├── fsdp_eks_report_screenshot.png │ │ ├── install-injector │ │ ├── label-namespace │ │ ├── move_report │ │ └── uniinstall-injector │ ├── README.md │ ├── fsdp-llama2 │ │ ├── 1.distributed-training.sbatch │ │ ├── fsdp_rep_screenshot.png │ │ ├── nsys-slurm-exec │ │ └── train.py │ ├── nccl │ │ ├── 0.nsight_nccl.sbatch │ │ ├── NCCL_Scatter_Perf.png │ │ ├── all_reduce_csv_screenshot.png │ │ ├── all_reduce_sum.png │ │ └── plot_nccl.py │ ├── nemotron │ │ ├── 1.nemotron.sbatch │ │ ├── nemo.Dockerfile │ │ ├── nemotron-15B-P5-report.png │ │ └── nemotron-slurm-exec.sh │ └── slurm-workshop-artifacts │ │ ├── get_nccl_msg_size.py │ │ ├── install_nsight.sh │ │ ├── nccl-slurm-exec-delay-duration.sh │ │ ├── nccl-slurm-exec-steps.sh │ │ ├── nccl-slurm-exec.sh │ │ └── requirements.txt └── efa-versions.py ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── conftest.py └── micro-benchmarks ├── nccl-tests ├── README.md ├── aws-batch │ └── README.md ├── buildspec.yaml ├── kubernetes │ ├── nccl-tests-gb200.yaml │ └── nccl-tests.yaml ├── nccl-tests.Dockerfile ├── nccl_to_csv.py ├── slurm │ ├── nccl-tests-ami.sbatch │ ├── nccl-tests-container.sbatch │ └── topology-aware-nccl-tests │ │ ├── README.md │ │ ├── generate_hostfile.sh │ │ ├── hostfile_topologify.py │ │ ├── nccl-tests-ami.sbatch │ │ ├── nccl-tests-container.sbatch │ │ ├── process_nccl_results.sh │ │ ├── run_unit_tests.sh │ │ ├── submit_nccl_test_ami.sh │ │ ├── submit_nccl_test_container.sh │ │ ├── test_hostfile_topologify.py │ │ └── test_requirements.txt └── test_nccl_tests.py ├── nccom-tests ├── README.md └── slurm │ ├── README.md │ └── nccom-tests.sbatch └── nvshmem ├── README.md ├── nvshmem.Dockerfile └── slurm ├── alltoall_latency.sbatch ├── shmem_put_bw_internode.sbatch └── shmem_put_bw_intranode.sbatch /.editorconfig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.editorconfig -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.gitattributes -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/CODEOWNERS -------------------------------------------------------------------------------- /.github/workflows/closing-soon.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/workflows/closing-soon.yml -------------------------------------------------------------------------------- /.github/workflows/fsdp-eks-regression.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/workflows/fsdp-eks-regression.yml -------------------------------------------------------------------------------- /.github/workflows/fsdp-regression-test-container.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/workflows/fsdp-regression-test-container.yml -------------------------------------------------------------------------------- /.github/workflows/fsdp-regression-test-venv.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/workflows/fsdp-regression-test-venv.yml -------------------------------------------------------------------------------- /.github/workflows/megatron-ci-slurm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.github/workflows/megatron-ci-slurm.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.gitignore -------------------------------------------------------------------------------- /.markdownlint.jsonc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/.markdownlint.jsonc -------------------------------------------------------------------------------- /0.docs/EnableIdentityCenter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/EnableIdentityCenter.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup1.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup10.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup2.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup3.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup4.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup5.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup6.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup7.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup8.png -------------------------------------------------------------------------------- /0.docs/IdentityCenterSetup9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/IdentityCenterSetup9.png -------------------------------------------------------------------------------- /0.docs/batch-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/batch-arch.png -------------------------------------------------------------------------------- /0.docs/core-infra-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/core-infra-architecture.png -------------------------------------------------------------------------------- /0.docs/deploy_prometheus_grafana_cfn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/deploy_prometheus_grafana_cfn.png -------------------------------------------------------------------------------- /0.docs/deployment diagrams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/deployment diagrams.png -------------------------------------------------------------------------------- /0.docs/diagrams-omnigraffle.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/diagrams-omnigraffle.graffle -------------------------------------------------------------------------------- /0.docs/diagrams-templates.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/diagrams-templates.pptx -------------------------------------------------------------------------------- /0.docs/eks-model-training-multi-az.drawio: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/eks-model-training-multi-az.drawio -------------------------------------------------------------------------------- /0.docs/eks-model-training-single-az.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/eks-model-training-single-az.png -------------------------------------------------------------------------------- /0.docs/fsx-lustre-template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/fsx-lustre-template.png -------------------------------------------------------------------------------- /0.docs/observability_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/observability_architecture.png -------------------------------------------------------------------------------- /0.docs/parallelcluster-arch-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/parallelcluster-arch-diagram.png -------------------------------------------------------------------------------- /0.docs/parallelcluster-prerequisites-cfn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/parallelcluster-prerequisites-cfn.png -------------------------------------------------------------------------------- /0.docs/ssm-connect-user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/ssm-connect-user.png -------------------------------------------------------------------------------- /0.docs/ssm-connect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/ssm-connect.png -------------------------------------------------------------------------------- /0.docs/vpc-all-az.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/vpc-all-az.png -------------------------------------------------------------------------------- /0.docs/vpc-one-az.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/vpc-one-az.png -------------------------------------------------------------------------------- /0.docs/vpc-template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/0.docs/vpc-template.png -------------------------------------------------------------------------------- /1.architectures/0.common/0.private-bucket.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/0.common/0.private-bucket.yaml -------------------------------------------------------------------------------- /1.architectures/0.common/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/0.common/README.md -------------------------------------------------------------------------------- /1.architectures/0.common/hyperpod-event-bridge-email.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/0.common/hyperpod-event-bridge-email.yaml -------------------------------------------------------------------------------- /1.architectures/1.vpc_network/1.vpc-multi-az.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/1.vpc_network/1.vpc-multi-az.yaml -------------------------------------------------------------------------------- /1.architectures/1.vpc_network/2.vpc-one-az.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/1.vpc_network/2.vpc-one-az.yaml -------------------------------------------------------------------------------- /1.architectures/1.vpc_network/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/1.vpc_network/README.md -------------------------------------------------------------------------------- /1.architectures/1.vpc_network/status.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/1.vpc_network/status.sh -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/.gitignore: -------------------------------------------------------------------------------- 1 | env_vars -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/README-full-fledged.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/README-full-fledged.md -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/README.md -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/cluster-templates/cluster-vanilla.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/cluster-templates/cluster-vanilla.yaml -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/tips-and-tricks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/tips-and-tricks.md -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/troubleshooting-guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/troubleshooting-guide.md -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/utils/create_config.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/utils/create_config.sh -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/utils/easy-ssh.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/utils/easy-ssh.sh -------------------------------------------------------------------------------- /1.architectures/2.aws-parallelcluster/utils/pcluster-fetch-config.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/2.aws-parallelcluster/utils/pcluster-fetch-config.sh -------------------------------------------------------------------------------- /1.architectures/3.aws-batch/0.aws-batch-distributed-training-p5.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/3.aws-batch/0.aws-batch-distributed-training-p5.yaml -------------------------------------------------------------------------------- /1.architectures/3.aws-batch/0.aws-batch-distributed-training.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/3.aws-batch/0.aws-batch-distributed-training.yaml -------------------------------------------------------------------------------- /1.architectures/3.aws-batch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/3.aws-batch/README.md -------------------------------------------------------------------------------- /1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/README.md -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-g4dn.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-g4dn.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-p4de-odcr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-p4de-odcr.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml -------------------------------------------------------------------------------- /1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/2.SageMakerVPC.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/2.SageMakerVPC.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/3.FSxLustre.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/3.FSxLustre.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/add_users.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/add_users.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/apply_hotfix.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/apply_hotfix.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_efa_exporter.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_efa_exporter.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_node_exporter.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_node_exporter.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_observability.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_observability.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/stop_observability.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/stop_observability.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/on_create.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/on_create.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_mariadb_accounting.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_mariadb_accounting.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_rds_accounting.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_rds_accounting.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_sssd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_sssd.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_user_associations.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_user_associations.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/shared_users_sample.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/shared_users_sample.txt -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enroot.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enroot.conf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_ansible.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_ansible.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_docker.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_docker.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.txt -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/mount-s3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/mount-s3.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/ssh-to-compute.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/ssh-to-compute.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/update_neuron_sdk.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/update_neuron_sdk.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/README.md -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/media/automate-smhp-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/media/automate-smhp-demo.gif -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/create_config.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/create_config.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/easy-ssh.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/hyperpod-precheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/hyperpod-precheck.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/patching-backup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/patching-backup.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/README.md -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/01-studio-hyperpod-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/01-studio-hyperpod-architecture.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/02-studio-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/02-studio-home.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/03-codeditor-fsx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/03-codeditor-fsx.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/07-fsx-shared.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/07-fsx-shared.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/08-fsx-partitioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/08-fsx-partitioned.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/09-studio-user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/09-studio-user.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/media/10-filesystem-check.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/10-filesystem-check.png -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/slurm-studio/studio-slurm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/slurm-studio/studio-slurm.yaml -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/README.md -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/.gitignore -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/outputs.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/providers.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/providers.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/variables.tf -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/tools/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/tools/README.md -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/tools/dump_cluster_nodes_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/tools/dump_cluster_nodes_info.py -------------------------------------------------------------------------------- /1.architectures/5.sagemaker-hyperpod/validate-config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/5.sagemaker-hyperpod/validate-config.py -------------------------------------------------------------------------------- /1.architectures/6.ldap_server/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/6.ldap_server/README.md -------------------------------------------------------------------------------- /1.architectures/6.ldap_server/cf_ldap_server.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/6.ldap_server/cf_ldap_server.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/README-manual-steps.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/README-manual-steps.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/requirements.txt: -------------------------------------------------------------------------------- 1 | cfnresponse~=1.1.4 2 | PyYAML~=6.0.1 -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/create_config.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.gitignore: -------------------------------------------------------------------------------- 1 | slurm*/ 2 | 3 | *.tgz 4 | 5 | new-values.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-custom.tfvars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-custom.tfvars -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-llama2_7b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-llama2_7b-training.sbatch -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-params.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-params.json -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-values.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-custom.tfvars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-custom.tfvars -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-llama2_7b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-llama2_7b-training.sbatch -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-params.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-params.json -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/p5/p5-values.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/smhp-eks-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/smhp-eks-arch.png -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/task-governance/visualize_topology.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/task-governance/visualize_topology.sh -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/.gitignore -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/versions.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/versions.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/versions.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/versions.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png -------------------------------------------------------------------------------- /1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh -------------------------------------------------------------------------------- /1.architectures/8.accounting-database/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/8.accounting-database/README.md -------------------------------------------------------------------------------- /1.architectures/8.accounting-database/cf_database-accounting.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/8.accounting-database/cf_database-accounting.yaml -------------------------------------------------------------------------------- /1.architectures/efa-cheatsheet.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/1.architectures/efa-cheatsheet.md -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/Makefile -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/README.md -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/inventory/group_vars/all.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/inventory/group_vars/all.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/inventory/hosts: -------------------------------------------------------------------------------- 1 | [images] 2 | pcluster 3 | default 4 | -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/packer-ami.pkr.hcl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/packer-ami.pkr.hcl -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-dlami-gpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-dlami-gpu.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-dlami-neuron.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-dlami-neuron.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-eks-gpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-eks-gpu.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-cpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-cpu.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-gpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-gpu.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-neuron.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-neuron.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/base/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/base/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/base/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/base/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/docker/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/docker/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/docker/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/defaults/main.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - role: docker -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/observability/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/observability/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/observability/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/observability/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/packages/defaults/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/packages/defaults/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/packages/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/packages/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/tasks/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/tasks/main.yml -------------------------------------------------------------------------------- /2.ami_and_containers/3.pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/3.pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml -------------------------------------------------------------------------------- /2.ami_and_containers/3.pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/3.pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml -------------------------------------------------------------------------------- /2.ami_and_containers/3.pcluster_create_dlami/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/3.pcluster_create_dlami/README.md -------------------------------------------------------------------------------- /2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile -------------------------------------------------------------------------------- /2.ami_and_containers/containers/pytorch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/containers/pytorch/README.md -------------------------------------------------------------------------------- /2.ami_and_containers/tools/ec2md/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/tools/ec2md/README.md -------------------------------------------------------------------------------- /2.ami_and_containers/tools/ec2md/ec2md.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/2.ami_and_containers/tools/ec2md/ec2md.sh -------------------------------------------------------------------------------- /3.test_cases/jax/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/jax/README.md -------------------------------------------------------------------------------- /3.test_cases/jax/jax.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/jax/jax.sbatch -------------------------------------------------------------------------------- /3.test_cases/jax/jax_paxml.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/jax/jax_paxml.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/jax/run_paxml.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/jax/run_paxml.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/0.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/0.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/1.uniref50.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/1.uniref50.slurm -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/2.esm1nv_pretrain.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/2.esm1nv_pretrain.slurm -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/bionemo_2.5/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/bionemo_2.5/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/bionemo_2.5/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t bionemo:aws . 4 | -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/bionemo_2.5/enroot.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/bionemo_2.5/enroot.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/bionemo_2.5/get-data.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/bionemo_2.5/get-data.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/bionemo_2.5/train-esm.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/bionemo_2.5/train-esm.sbatch -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/prepare_uniref50.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/prepare_uniref50.py -------------------------------------------------------------------------------- /3.test_cases/megatron/bionemo/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/bionemo/requirements.txt -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/aws-megatron-lm.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/aws-megatron-lm.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/gpt3/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/.gitignore -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/Makefile -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/gpt3/2.distributed-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/gpt3/2.distributed-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/gpt3/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/gpt3/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/llama2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/llama2/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/slurm/llama2/pretrain-llama2.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/slurm/llama2/pretrain-llama2.sbatch -------------------------------------------------------------------------------- /3.test_cases/megatron/megatron-lm/test_megatron_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/megatron-lm/test_megatron_lm.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/build.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/custom_data_module.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/custom_data_module.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/data-processing/load_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/data-processing/load_dataset.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/env_vars.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/env_vars.json -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_merges: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_merges -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_vocab -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/push.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/push.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/kubernetes/venv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/kubernetes/venv.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/slurm/.gitignore: -------------------------------------------------------------------------------- 1 | nemo-env 2 | 3 | -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/slurm/env_vars.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/slurm/env_vars.json -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/slurm/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/slurm/run.py -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo/slurm/venv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo/slurm/venv.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/.gitignore -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/1.bmk-pretrain-gpt3-126m.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/1.bmk-pretrain-gpt3-126m.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/2.bmk-pretrain-gpt3-5b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/2.bmk-pretrain-gpt3-5b.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/3.bmk-pretrain-gpt3-40b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/3.bmk-pretrain-gpt3-40b.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/4.bmk-pretrain-gpt3-175b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/4.bmk-pretrain-gpt3-175b.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/5.bmk-pretrain-llama-7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/5.bmk-pretrain-llama-7b.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/6.bmk-pretrain-llama-70b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/6.bmk-pretrain-llama-70b.sh -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/0.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/0.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/fsx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/fsx.png -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pv.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pvc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pvc.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-storage-class.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-storage-class.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/config.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/EKS/p4de-cluster-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/EKS/p4de-cluster-config.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/README.md -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/conf.template/cluster/bcm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/conf.template/cluster/bcm.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/conf.template/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/conf.template/config.yaml -------------------------------------------------------------------------------- /3.test_cases/megatron/nemo1.0/test_nemo_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/megatron/nemo1.0/test_nemo_launcher.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/.gitignore -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/generate-sbatch-training-files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/generate-sbatch-training-files.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama2_13b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama2_13b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama2_70b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama2_70b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama2_7b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama2_7b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama3_1_70b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_70b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp-hpto.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp-hpto.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/llama3_2_3b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/llama3_2_3b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/mathstral_7b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/mathstral_7b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/mistral_8x7b-fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/mistral_8x7b-fsdp.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/kubernetes/training_kubernetes.template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/kubernetes/training_kubernetes.template -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama2_13b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama2_13b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama2_70b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama2_70b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama2_7b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama2_7b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama3_1_70b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama3_1_70b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama3_1_8b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama3_1_8b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama3_2_1b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama3_2_1b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/llama3_2_3b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/llama3_2_3b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/mathstral_7b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/mathstral_7b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/models/mistral_8x7b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/models/mistral_8x7b.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/create_venv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/create_venv.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/slurm/training-sub.template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/slurm/training-sub.template -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/model_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/model_utils/arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/model_utils/arguments.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/model_utils/checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/model_utils/checkpoint.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/model_utils/concat_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/model_utils/concat_dataset.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/model_utils/train_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/model_utils/train_utils.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/requirements.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/FSDP/src/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/FSDP/src/train.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/.gitignore -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/ddp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/ddp.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/kubernetes/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/kubernetes/fsdp-simple.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/kubernetes/fsdp-simple.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/kubernetes/fsdp.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/kubernetes/fsdp.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/slurm/0.create-conda-env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/slurm/0.create-conda-env.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/slurm/1.conda-train.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/slurm/1.conda-train.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/slurm/2.create-enroot-image.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/slurm/2.create-enroot-image.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/slurm/3.container-train.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/slurm/3.container-train.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/cpu-ddp/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/cpu-ddp/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/1.build-image.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/1.build-image.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/Makefile -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/.gitignore: -------------------------------------------------------------------------------- 1 | Megatron-DeepSpeed -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/.gitignore: -------------------------------------------------------------------------------- 1 | ds_config.json -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/distillation/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/distillation/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/distillation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/distillation/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/distillation/kubernetes/distill.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/distillation/kubernetes/distill.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/distillation/src/distil_logits_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/distillation/src/distil_logits_cli.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/distillation/src/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/distillation/src/requirements.txt -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/mpt/0.llm-foundry.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/mpt/0.llm-foundry.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/mpt/1.c4-preprocess.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/mpt/1.c4-preprocess.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/mpt/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/mpt/Makefile -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/mpt/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/mpt/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/neuronx-distributed/llama3/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/neuronx-distributed/llama3/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/tokenize_data.yaml-template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/tokenize_data.yaml-template -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/get_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/get_model.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/merge_lora_weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/merge_lora_weights.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/model_consolidation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/model_consolidation.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/peft_tokenize_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/peft_tokenize_data.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/run_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/run_inference.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/optimum-neuron/llama3/src/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/optimum-neuron/llama3/src/train.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/.gitignore: -------------------------------------------------------------------------------- 1 | conf 2 | *.nsys-rep 3 | hf_model* -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/SmolLM-1.7B/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/SmolLM-1.7B/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/SmolLM-1.7B/ec2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/SmolLM-1.7B/ec2/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/train.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/train.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/create_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/create_config.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/picotron.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/picotron.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/picotron/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/picotron/train.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/torchtitan/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/torchtitan/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/torchtitan/slurm/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/torchtitan/slurm/.gitignore -------------------------------------------------------------------------------- /3.test_cases/pytorch/torchtitan/slurm/0.create_conda_env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/torchtitan/slurm/0.create_conda_env.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/torchtitan/slurm/1.llama_3_8b_torchtitan.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/torchtitan/slurm/1.llama_3_8b_torchtitan.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/torchtitan/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/torchtitan/slurm/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/deepspeed_zero3.yaml -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/eval.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/grpo.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/grpo.Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/grpo_wandb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/grpo_wandb.png -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/inference.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/train.py -------------------------------------------------------------------------------- /3.test_cases/pytorch/trl/grpo/train.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/trl/grpo/train.sbatch -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/Dockerfile -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/img/ray-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/img/ray-dashboard.png -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/job-stop.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/job-stop.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/observability/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/observability/README.md -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/observability/add-ray-metrics.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/observability/add-ray-metrics.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/ray-expose.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/ray-expose.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/ray-hide.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/ray-hide.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/recipe/run_dapo_configurable.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/recipe/run_dapo_configurable.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/recipe/run_grpo_configurable.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/recipe/run_grpo_configurable.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/build-push.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/build-push.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/env_vars.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/install-kuberay.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/install-kuberay.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/load_data_dapo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/load_data_dapo.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/load_data_grpo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/load_data_grpo.sh -------------------------------------------------------------------------------- /3.test_cases/pytorch/verl/rlvr/setup/raycluster.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/3.test_cases/pytorch/verl/rlvr/setup/raycluster.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/1.pytorch-env-validation/0.pytorch-screen.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/1.pytorch-env-validation/0.pytorch-screen.Dockerfile -------------------------------------------------------------------------------- /4.validation_and_observability/1.pytorch-env-validation/1.torch-screen.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/1.pytorch-env-validation/1.torch-screen.sbatch -------------------------------------------------------------------------------- /4.validation_and_observability/1.pytorch-env-validation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/1.pytorch-env-validation/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/1.pytorch-env-validation/pytorch-screen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/1.pytorch-env-validation/pytorch-screen.py -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/Dockerfile -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/Makefile -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/buildspec.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go -------------------------------------------------------------------------------- /4.validation_and_observability/3.efa-node-exporter/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/README-OS-grafana.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/README-OS-grafana.md -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/README-grafana-alerts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/README-grafana-alerts.md -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/Observability-Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/Observability-Architecture.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/add-to-channel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/add-to-channel.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/alert-firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/alert-firing.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/alert-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/alert-test.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/alert-threshold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/alert-threshold.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/configure-contact-point.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/configure-contact-point.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/configure_grafana_alerting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/configure_grafana_alerting.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/create-slack-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/create-slack-app.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/enable_grafana_alerting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/enable_grafana_alerting.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/eval-threshold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/eval-threshold.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-alert.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-by-node.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-by-node.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/gpu-health.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/grafana-service-token-lambda-function.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/grafana-service-token-lambda-function.zip -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource1.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource2.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource3.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/retrieve-amp-endpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/retrieve-amp-endpoint.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/slack-app-workspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/slack-app-workspace.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/slack-scopes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/slack-scopes.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/cluster-observability-os-grafana.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/cluster-observability-os-grafana.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/dcgm-metrics.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/dcgm-metrics.csv -------------------------------------------------------------------------------- /4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/2.generate_recipes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/2.generate_recipes.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/custom_values.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/custom_values.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/fsdp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/fsdp.yaml -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/fsdp_eks_report_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/fsdp_eks_report_screenshot.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/install-injector: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/install-injector -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/label-namespace: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/label-namespace -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/move_report: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/move_report -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/EKS/uniinstall-injector: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/EKS/uniinstall-injector -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/README.md -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/fsdp-llama2/1.distributed-training.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/fsdp-llama2/1.distributed-training.sbatch -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/fsdp-llama2/fsdp_rep_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/fsdp-llama2/fsdp_rep_screenshot.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/fsdp-llama2/nsys-slurm-exec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/fsdp-llama2/nsys-slurm-exec -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/fsdp-llama2/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/fsdp-llama2/train.py -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nccl/0.nsight_nccl.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nccl/0.nsight_nccl.sbatch -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nccl/NCCL_Scatter_Perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nccl/NCCL_Scatter_Perf.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nccl/all_reduce_csv_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nccl/all_reduce_csv_screenshot.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nccl/all_reduce_sum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nccl/all_reduce_sum.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nccl/plot_nccl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nccl/plot_nccl.py -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nemotron/1.nemotron.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nemotron/1.nemotron.sbatch -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nemotron/nemo.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nemotron/nemo.Dockerfile -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nemotron/nemotron-15B-P5-report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nemotron/nemotron-15B-P5-report.png -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/nemotron/nemotron-slurm-exec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/nemotron/nemotron-slurm-exec.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/get_nccl_msg_size.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/get_nccl_msg_size.py -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/install_nsight.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/install_nsight.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh -------------------------------------------------------------------------------- /4.validation_and_observability/5.nsight/slurm-workshop-artifacts/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/requirements.txt -------------------------------------------------------------------------------- /4.validation_and_observability/efa-versions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/4.validation_and_observability/efa-versions.py -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/README.md -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/conftest.py -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/aws-batch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/aws-batch/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/buildspec.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/buildspec.yaml -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/nccl-tests.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/nccl_to_csv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/nccl_to_csv.py -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile_topologify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile_topologify.py -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/run_unit_tests.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/run_unit_tests.sh -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/test_hostfile_topologify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/test_hostfile_topologify.py -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/test_requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/test_requirements.txt -------------------------------------------------------------------------------- /micro-benchmarks/nccl-tests/test_nccl_tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccl-tests/test_nccl_tests.py -------------------------------------------------------------------------------- /micro-benchmarks/nccom-tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccom-tests/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nccom-tests/slurm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccom-tests/slurm/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nccom-tests/slurm/nccom-tests.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nccom-tests/slurm/nccom-tests.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nvshmem/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nvshmem/README.md -------------------------------------------------------------------------------- /micro-benchmarks/nvshmem/nvshmem.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nvshmem/nvshmem.Dockerfile -------------------------------------------------------------------------------- /micro-benchmarks/nvshmem/slurm/alltoall_latency.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nvshmem/slurm/alltoall_latency.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nvshmem/slurm/shmem_put_bw_internode.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nvshmem/slurm/shmem_put_bw_internode.sbatch -------------------------------------------------------------------------------- /micro-benchmarks/nvshmem/slurm/shmem_put_bw_intranode.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/HEAD/micro-benchmarks/nvshmem/slurm/shmem_put_bw_intranode.sbatch --------------------------------------------------------------------------------