├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation_request.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ ├── pre-commit-check-runner-push.yml │ ├── repo-monitoring-cron.yml │ ├── security-monitoring-cron.yml │ └── unit-test-runner-push.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Config ├── LICENSE ├── NOTICE ├── README.md ├── THIRD-PARTY.txt ├── launcher ├── __init__.py ├── accelerator_devices.py ├── config_validator │ ├── type_validator.py │ └── value_validator.py ├── efa.py ├── nemo │ ├── README.md │ ├── __init__.py │ ├── constants.py │ ├── k8s_templates │ │ └── training │ │ │ ├── Chart.yaml │ │ │ ├── train-script-gpu.yaml │ │ │ ├── train-script-trn.yaml │ │ │ ├── training-config.yaml │ │ │ ├── training.yaml │ │ │ └── values.yaml │ ├── launchers.py │ ├── recipe_stages.py │ ├── slurm_launcher.py │ └── stages.py └── telemetry.py ├── launcher_scripts ├── custom_model │ └── run_falcon.sh ├── custom_script │ ├── README.md │ ├── config_k8s.yaml │ ├── config_slurm.yaml │ ├── custom_allreduce.py │ └── run_allreduce.sh ├── deepseek │ ├── run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh │ ├── run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq8k_gpu_fine_tuning.sh │ └── run_hf_deepseek_r1_qwen_7b_seq8k_gpu_lora.sh ├── llama │ ├── p4_run_hf_llama3_70b_seq8k.sh │ ├── p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh │ ├── p4_run_hf_llama3_70b_seq8k_gpu_lora.sh │ ├── p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh │ ├── p4_run_hf_llama3_8b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh │ ├── run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh │ ├── run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh │ ├── run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_3_70b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_3_70b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq128k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq16k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq32k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq8k_gpu_qlora.sh │ ├── 
run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_70b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_70b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh │ ├── run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_8b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_gpu_dpo.sh │ ├── run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_8b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh │ ├── run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh │ ├── run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh │ ├── run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh │ ├── run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh │ └── run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh ├── mistral │ ├── run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh │ └── run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh └── mixtral │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh │ └── run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh ├── main.py ├── pyproject.toml ├── recipes_collection ├── cluster │ ├── k8s.yaml │ ├── slurm.yaml │ └── sm_jobs.yaml ├── config.yaml └── recipes │ ├── fine-tuning │ ├── deepseek │ │ ├── hf_deepseek_r1_671b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_671b_seq8k_gpu_qlora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_fine_tuning.yaml │ │ ├── 
hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_fine_tuning.yaml │ │ └── hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_lora.yaml │ └── llama │ │ ├── hf_llama3_3_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_3_70b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_3_70b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq128k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq16k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq32k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq8k_gpu_qlora.yaml │ │ ├── hf_llama3_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_70b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_70b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_8b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_dpo.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq8k_trn1_fine_tuning.yaml │ │ ├── hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.yaml │ │ ├── hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.yaml │ │ ├── hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml │ │ ├── hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml │ │ ├── p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── p4_hf_llama3_70b_seq8k_gpu_lora.yaml │ │ ├── p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml │ │ └── p4_hf_llama3_8b_seq8k_gpu_lora.yaml │ └── training │ ├── custom_model │ └── falcon.yaml │ ├── llama │ ├── hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml │ ├── hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.yaml │ ├── hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml │ ├── hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x128_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x128_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml │ ├── hf_llama3_70b_seq8k_trn1x16_pretrain.yaml │ ├── hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml │ ├── hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_8b_seq8k_trn1x4_pretrain.yaml │ ├── megatron_llama3_1_8b_nemo.yaml │ └── p4_hf_llama3_70b_seq8k_gpu.yaml │ ├── mistral │ ├── hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml │ └── hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml │ └── mixtral │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.yaml │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml │ ├── hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml │ └── hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml ├── requirements.txt ├── scripts └── 
licenseChecker.sh ├── template └── sm_jobs.py ├── tests ├── __init__.py ├── config_validator │ ├── test_type_validator.py │ └── test_value_validator.py ├── k8s_workflow │ ├── k8s_baseline_artifacts │ │ ├── llama-8b │ │ │ ├── k8s_template │ │ │ │ ├── Chart.yaml │ │ │ │ ├── config │ │ │ │ │ └── llama-8b_hydra.yaml │ │ │ │ ├── templates │ │ │ │ │ ├── training-config.yaml │ │ │ │ │ └── training.yaml │ │ │ │ └── values.yaml │ │ │ ├── llama-8b_hydra.yaml │ │ │ └── llama-8b_submission.sh │ │ └── test_custom │ │ │ ├── k8s_template │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── training.yaml │ │ │ └── values.yaml │ │ │ └── test_custom_submission.sh │ ├── test_custom_k8s_workflow.py │ └── test_recipe_k8s_workflow.py ├── slurm_workflow │ ├── slurm_baseline_artifacts │ │ ├── hf-llama3-8b │ │ │ ├── launch_docker_container.sh │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── sagemaker-hf-llama3-8b_submission.sh │ │ │ └── train_script.sh │ │ ├── llama-8b │ │ │ ├── docker_exec_script.sh │ │ │ ├── launch_docker_container.sh │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── sagemaker-llama-8b_submission.sh │ │ │ └── train_script.sh │ │ └── test_custom │ │ │ ├── docker_exec_script.sh │ │ │ ├── launch_docker_container.sh │ │ │ ├── testcustom_slurm_test_custom_submission.sh │ │ │ └── train_script.sh │ ├── test_custom_slurm_workflow.py │ └── test_recipe_slurm_workflow.py ├── sm_jobs_workflow │ ├── __init__.py │ ├── sm_jobs_baseline_artifacts │ │ ├── multimodal │ │ │ ├── llama3-2-11b │ │ │ │ ├── launch.py │ │ │ │ ├── llama3-2-11b_hydra.yaml │ │ │ │ ├── llama3-2-11b_submission.sh │ │ │ │ ├── recipe.yaml │ │ │ │ ├── requirements.txt │ │ │ │ └── sm_jobs_config.yaml │ │ │ └── llama3.2-11b │ │ │ │ ├── launch.py │ │ │ │ ├── llama3.2-11b_hydra.yaml │ │ │ │ ├── llama3.2-11b_submission.sh │ │ │ │ ├── recipe.yaml │ │ │ │ ├── requirements.txt │ │ │ │ └── sm_jobs_config.yaml │ │ ├── no_kwargs │ │ │ └── llama-8b │ │ │ │ ├── launch.py │ │ │ │ ├── llama-8b_hydra.yaml │ │ │ │ ├── llama-8b_submission.sh │ │ │ │ └── sm_jobs_config.yaml │ │ └── with_kwargs │ │ │ └── llama-8b │ │ │ ├── launch.py │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── llama-8b_submission.sh │ │ │ └── sm_jobs_config.yaml │ └── test_sm_jobs_workflow.py ├── test_config_files.py ├── test_launcher_scripts.py ├── test_readme.py ├── test_recipes.py └── test_utils.py └── validations_wrapper.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | # Exclude submodule directory from coverage 3 | omit = 4 | launcher/nemo/nemo_framework_launcher/* 5 | template/* 6 | 7 | [report] 8 | fail_under = 85 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: File a report to help us reproduce and fix the problem 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## How to Reproduce? 14 | A clear, step-by-step set of instructions to reproduce the bug. 15 | The provided code need to be **complete** and **runnable**, if additional data is needed, please include them in the issue. 16 | 17 | ## Expected behavior 18 | A clear and concise description of what you expected to happen. 19 | 20 | ## Screenshots, error messages or logs 21 | If applicable, please share with us screenshots, error messages or logs to help explain your problem. 
22 | 23 | ## System information 24 | A description of your system. Please provide: 25 | - **Docker image you ran against**: 26 | - **Source code version you ran against**: 27 | - **Python version**: 28 | - **Hardware accelerator used**: 29 | 30 | ## Additional context 31 | Add any other context about the problem here. Please provide any additional steps you have tried to solve your issue here. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation request 3 | about: Request improved documentation 4 | title: '' 5 | labels: 'documentation request' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## What did you find confusing? 11 | A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...] 12 | 13 | ## Describe how documentation can be improved 14 | A clear and concise description of where documentation was lacking and how it can be improved. 15 | 16 | ## Additional context 17 | Add any other context or screenshots about the documentation request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest new functionality for this project 4 | title: '' 5 | labels: 'feature request' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the feature you'd like 11 | A clear and concise description of the functionality you want. 12 | 13 | ## How would this feature be used? 14 | A clear and concise description of the use case for this feature. Please provide an example, if possible. 15 | 16 | ## Describe alternatives you've considered 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | ## Additional context 20 | Add any other context about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | ### Motivation 4 | Explain the motivation 5 | 6 | ### Changes 7 | * List your changes 8 | 9 | ### Testing 10 | Explain how the changes were tested 11 | 12 | ## Merge Checklist 13 | Put an x in the boxes that apply. If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request. 14 | 15 | ### General 16 | - [ ] I have read the [CONTRIBUTING](../CONTRIBUTING.md) doc 17 | - [ ] I have run `pre-commit run --all-files` on my code. It will check for [this configuration](../.pre-commit-config.yaml). 18 | - [ ] I have updated any necessary documentation, including [READMEs](../README.md) and API docs (if appropriate) 19 | - [ ] I have verified the licenses used in the license-files artifact generated in the Python License Scan CI check. If the license workflow fails, kindly check the licenses used in the artifact. 20 | 21 | ### Tests 22 | - [ ] I have run `pytest` on my code and all unit tests passed. 
23 | - [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) 24 | 25 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 26 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit-check-runner-push.yml: -------------------------------------------------------------------------------- 1 | name: Python Pre Commit Check CI After Commit 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Triggers on direct pushes to the main branch 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.8' # Set python version to 3.8 20 | 21 | - name: Install pre-commit dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install pre-commit 25 | 26 | - name: Run pre-commit checks 27 | run: | 28 | pre-commit run --all-files 29 | -------------------------------------------------------------------------------- /.github/workflows/repo-monitoring-cron.yml: -------------------------------------------------------------------------------- 1 | name: Repository Monitoring 2 | 3 | on: 4 | schedule: 5 | - cron: '0 16 * * *' 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.run_id }} 9 | cancel-in-progress: true 10 | 11 | permissions: 12 | id-token: write # This is required for requesting the JWT 13 | contents: read # This is required for actions/checkout 14 | 15 | jobs: 16 | check-pr-alerts: 17 | runs-on: ubuntu-latest 18 | if: github.event.repository.visibility == 'public' 19 | timeout-minutes: 10 20 | outputs: 21 | pr_count: ${{ steps.pr-count.outputs.count }} 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v3 25 | - name: Check for open PRs 26 | id: pr-count 27 | env: 28 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 29 | run: | 30 | pr_count=$(gh pr list --state open --limit 1000 | wc -l) 31 | echo "count=$pr_count" >> $GITHUB_OUTPUT 32 | 33 | check-issue-alerts: 34 | runs-on: ubuntu-latest 35 | if: github.event.repository.visibility == 'public' 36 | timeout-minutes: 10 37 | outputs: 38 | issue_count: ${{ steps.issue-count.outputs.count }} 39 | steps: 40 | - name: Checkout code 41 | uses: actions/checkout@v3 42 | - name: Check for open issues 43 | id: issue-count 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 46 | run: | 47 | issue_count=$(gh issue list --state open --limit 1000 | wc -l) 48 | echo "count=$issue_count" >> $GITHUB_OUTPUT 49 | 50 | put-metric-data: 51 | runs-on: ubuntu-latest 52 | if: github.event.repository.visibility == 'public' 53 | timeout-minutes: 10 54 | needs: [check-pr-alerts, check-issue-alerts] 55 | steps: 56 | - name: Configure AWS Credentials 57 | uses: aws-actions/configure-aws-credentials@v2 58 | with: 59 | role-to-assume: ${{ secrets.RUNNER_ROLE_ARN }} 60 | role-session-name: repo-monitoring-cron-session 61 | aws-region: us-west-2 62 | 63 | - name: Put PR Alert Metric Data 64 | run: | 65 | aws cloudwatch put-metric-data --metric-name PRAlert --namespace RepoMetrics --value ${{ needs.check-pr-alerts.outputs.pr_count }} --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes 66 | 67 | - name: Put Issue Alert Metric Data 68 | run: | 69 | aws cloudwatch put-metric-data --metric-name IssueAlert --namespace RepoMetrics --value ${{ needs.check-issue-alerts.outputs.issue_count }} --unit Count --dimensions 
ProjectName=sagemaker-hyperpod-recipes 70 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-runner-push.yml: -------------------------------------------------------------------------------- 1 | name: Python Unit Test CI After Commit 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Triggers on direct pushes to the main branch 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | with: 16 | submodules: recursive # Checkout submodules as well 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.8' # Set python version to 3.8 22 | 23 | - name: Install unit test dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r launcher/nemo/nemo_framework_launcher/requirements.txt 27 | pip install pytest 28 | pip install pytest-cov 29 | 30 | - name: Run unit tests 31 | run: | 32 | python -m pytest 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # log and data files 2 | trace 3 | .DS_Store 4 | .hydra 5 | .bash_history.local 6 | results/ 7 | outputs/ 8 | tmp/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | **.pyc 15 | core.* 16 | 17 | # Unit test / coverage reports 18 | coverage_html_report/ 19 | .coverage 20 | .coverage.* 21 | .cache 22 | *.cover 23 | .hypothesis/ 24 | .pytest_cache/ 25 | 26 | # Playground area 27 | mypg/ 28 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "launcher/nemo/nemo_framework_launcher"] 2 | path = launcher/nemo/nemo_framework_launcher 3 | url = https://github.com/NVIDIA/NeMo-Framework-Launcher.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | # force all unspecified python hooks to run python3 3 | python: python3 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v2.3.0 7 | hooks: 8 | - id: end-of-file-fixer 9 | exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) 10 | - id: trailing-whitespace 11 | exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) 12 | - repo: https://github.com/humitos/mirrors-autoflake.git 13 | rev: v1.3 14 | hooks: 15 | - id: autoflake 16 | args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports'] 17 | additional_dependencies: [setuptools] 18 | - repo: https://github.com/psf/black 19 | rev: 23.3.0 20 | hooks: 21 | - id: black 22 | args: [--line-length=120] 23 | - repo: https://github.com/pocc/pre-commit-hooks 24 | rev: v1.1.1 25 | hooks: 26 | - id: clang-format 27 | args: [--style=file, -i] 28 | - repo: https://github.com/pycqa/isort 29 | rev: 5.12.0 30 | hooks: # imports sorting 31 | - id: isort 32 | name: isort (python) 33 | args: ["--profile", "black"] 34 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /Config: -------------------------------------------------------------------------------- 1 | package.SagemakerTrainingLauncher = { 2 | interfaces = (1.0); 3 | 4 | # Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild 5 | build-system = no-op; 6 | build-tools = { 7 | 1.0 = { 8 | NoOpBuild = 1.0; 9 | }; 10 | }; 11 | 12 | # Use runtime-dependencies for when you want to bring in additional 13 | # packages when deploying. 14 | # Use dependencies instead if you intend for these dependencies to 15 | # be exported to other packages that build against you. 16 | dependencies = { 17 | 1.0 = { 18 | }; 19 | }; 20 | 21 | runtime-dependencies = { 22 | 1.0 = { 23 | }; 24 | }; 25 | 26 | }; 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /launcher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /launcher/nemo/README.md: -------------------------------------------------------------------------------- 1 | # Core NeMo launching implementations 2 | This folder contains the core launching framework for NeMo-based implementations. We use the same design as the [NeMo-Framework-Launcher](https://github.com/NVIDIA/NeMo-Framework-Launcher/tree/main). Basically there are two steps (see the sketch after this list): 3 | - A stage defined in `stages.py` prepares the training script launch command and the cluster configs, then passes these configs into the actual launcher 4 | - A launcher defined in `launchers.py` takes the configs from the stage and generates the real launching script. The launcher then kicks off the run using the corresponding cluster method, i.e. Slurm or k8s.
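For illustration, here is a minimal sketch of this two-step flow. It is purely hypothetical: the class and method names below are invented for the example and do not match the actual `stages.py`/`launchers.py` API.

```python
# Illustrative sketch only -- the real stages and launchers have different names and signatures.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class StageOutput:
    """What a stage hands to a launcher: the training command plus the cluster configs."""

    command: List[str]
    cluster_config: Dict[str, str] = field(default_factory=dict)


class ExampleStage:
    """Step 1: prepare the launching command and the cluster configs."""

    def __init__(self, cfg: Dict[str, str]):
        self.cfg = cfg

    def prepare(self) -> StageOutput:
        command = ["torchrun", self.cfg["entry_script"], "--config", self.cfg["config_path"]]
        cluster_config = {"nodes": self.cfg.get("nodes", "1")}
        return StageOutput(command=command, cluster_config=cluster_config)


class ExampleSlurmLauncher:
    """Step 2: turn the stage output into a real launching script and kick off the run."""

    def render(self, out: StageOutput) -> str:
        # In the real launcher this script is written to disk and submitted, e.g. via sbatch.
        return "#!/bin/bash\n#SBATCH --nodes={}\n{}\n".format(out.cluster_config["nodes"], " ".join(out.command))


if __name__ == "__main__":
    stage = ExampleStage({"entry_script": "train.py", "config_path": "config.yaml", "nodes": "2"})
    print(ExampleSlurmLauncher().render(stage.prepare()))
```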
5 | 6 | ## Stages 7 | We support different use cases, each corresponding to a stage: 8 | - `SMTraining`: Stage to run a native NeMo workload 9 | - `SMTrainingGPURecipe`: Stage to run our GPU recipes 10 | - `SMTrainingTrainiumRecipe`: Stage to run our Trainium recipes 11 | - `SMCustomTrainingGPU`: Stage for training with a custom script on GPU 12 | - `SMCustomTrainingTrainium`: Stage for training with a custom script on Trainium 13 | 14 | ## Launchers 15 | Currently we only need our own launchers for custom jobs, because we need to manage the `torchrun` command ourselves: 16 | - `SMSlurmLauncher`: Launcher for custom jobs using Slurm 17 | -------------------------------------------------------------------------------- /launcher/nemo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /launcher/nemo/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
13 | 14 | from pathlib import Path 15 | 16 | SM_ADAPTER_REPO = "https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git" 17 | NEMO_REPO = "https://github.com/NVIDIA/NeMo.git" 18 | NEMO_REPO_TAG = "v2.0.0rc0" # [TODO] move to v2.0.0 once it is released 19 | 20 | SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH = { 21 | "deepseek": "examples/deepseek/deepseek_pretrain.py", 22 | "llama": "examples/llama/llama_pretrain.py", 23 | "mistral": "examples/mistral/mistral_pretrain.py", 24 | "mixtral": "examples/mixtral/mixtral_pretrain.py", 25 | } 26 | 27 | NEURONX_REPO_URI = "https://github.com/aws-neuron/neuronx-distributed-training.git" 28 | NEURONX_REPO_TAG = "main" 29 | NEURONX_CONF_PATH = "examples/conf" 30 | 31 | # utility directory to more easily navigate to other parts of the package 32 | ROOT_DIR = Path(__file__).resolve().parent.parent.parent # package root 33 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/train-script-gpu.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: train-script-gpu-{{ $config.jobName }} 6 | data: 7 | train-script.sh: | 8 | #!/bin/bash 9 | set -ex 10 | 11 | {{- if $config.git.repo_url_or_path }} 12 | mkdir -p $HOME/tmp 13 | GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME 14 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 15 | git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR 16 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 17 | cd $GIT_CLONE_DIR 18 | rm -rf __pycache__ 19 | 20 | {{- if $config.git.branch }} 21 | git checkout {{ $config.git.branch }} 22 | {{- end }} 23 | 24 | {{- if $config.git.commit }} 25 | git fetch origin {{ $config.git.commit }} 26 | git reset --hard {{ $config.git.commit }} 27 | {{- end }} 28 | {{- if $config.git.update_adapter }} 29 | 30 | pip install . --force-reinstall --no-deps 31 | 32 | {{- end }} 33 | {{- else }} 34 | GIT_CLONE_DIR="" 35 | {{- end }} 36 | 37 | {{- range $config.pre_script }} 38 | {{ . }} 39 | {{- end }} 40 | 41 | {{- if gt (int $config.nodes) 1 }} 42 | export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --rdzv_backend=c10d --rdzv_endpoint={{ $config.jobName }}-worker-0" 43 | {{- else }} 44 | export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}" 45 | {{- end }} 46 | 47 | echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" 48 | torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }} \ 49 | {{- if $config.scriptArgs -}} 50 | {{ $config.scriptArgs }} 51 | {{- end }} 52 | 53 | {{- range $config.post_script }} 54 | {{ . 
}} 55 | {{- end }} 56 | 57 | {{- if $config.git.repo_url_or_path }} 58 | cd $HOME 59 | rm -rf $GIT_CLONE_DIR 60 | {{- end }} 61 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/training-config.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: training-config-{{ $config.jobName }} 6 | data: 7 | config.yaml: |- 8 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 9 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | # training image 3 | trainingImage: cfg.container 4 | 5 | # image pulling policy 6 | pullPolicy: IfNotPresent 7 | 8 | 9 | trainingConfig: 10 | # current job name 11 | jobName: "nil" 12 | 13 | # namespace to launch job 14 | namespace: "default" 15 | 16 | # script path 17 | scriptPath: null 18 | 19 | # script args 20 | scriptArgs: null 21 | 22 | # specify whether to use custom scripts 23 | customScript: null 24 | 25 | # list of custom annotations apply to jobs 26 | annotations: null 27 | 28 | # list of custom labels apply to jobs and pods 29 | customLabels: null 30 | 31 | # Kueue scheduler priority class name 32 | priority_class_name: null 33 | 34 | # device type, can be "gpu", "trainium" and "nil", "nil" means cpu 35 | device: "nil" 36 | 37 | # number of EFA devices if the instance type support EFA 38 | numEFADevices: 0 39 | 40 | # number of Neuron devices if job is for Trainium 41 | numNeuronDevices: null 42 | 43 | # number of process per node 44 | ntasksPerNode: 0 45 | 46 | # number of nodes to run 47 | nodes: training.trainer.num_nodes 48 | 49 | # restart policy 50 | restartPolicy: Never 51 | 52 | # from NeMo, not used currently 53 | wandbKey: "nil" 54 | 55 | # name of service account associated with the namespace 56 | serviceAccountName: null 57 | 58 | # relevant for Trainium chips, either 0 or 1 59 | compile: 0 60 | 61 | # persistent volume, usually used to mount FSx 62 | persistentVolumeClaims: null 63 | 64 | # temp volume, usually used to mount temp file in the host 65 | volumes: null 66 | 67 | # A github repo if user might want to use script inside 68 | git: 69 | repo_url_or_path: null 70 | branch: null 71 | commit: null 72 | token: null 73 | update_adapter: null 74 | 75 | # Commands to run before training 76 | pre_script: [] 77 | # Commands to run after training 78 | post_script: [] 79 | 80 | # select preferred and required labels for nodes 81 | labelSelector: 82 | required: null # select nodes with required labels 83 | preferred: null # select nodes with priority which has preferred labels 84 | weights: null # list of weights for the preferred labels 85 | 86 | # The clean up policy after the job completes or fails. 
87 | cleanPodPolicy: null 88 | -------------------------------------------------------------------------------- /launcher/telemetry.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import time 5 | from dataclasses import asdict, dataclass, field 6 | from typing import List 7 | 8 | CW_NAME_SPACE = "RecipesTelemetry" 9 | 10 | 11 | @dataclass 12 | class Metric: 13 | Name: str = None 14 | Unit: str = None 15 | 16 | 17 | @dataclass 18 | class MetricDirective: 19 | Namespace: str = "" 20 | Dimensions: List[List[str]] = None 21 | Metrics: List[Metric] = None 22 | 23 | 24 | @dataclass 25 | class Metadata: 26 | CloudWatchMetrics: List[MetricDirective] = field(default_factory=lambda: [MetricDirective]) 27 | Timestamp: int = None 28 | 29 | 30 | @dataclass 31 | class CWTelemetryStart: 32 | account_id: str = "" 33 | training_start_time: int = 0 34 | num_nodes: int = 0 35 | job_name: str = "" 36 | cluster_type: str = "" 37 | instance_type: str = "" 38 | _aws: Metadata = None 39 | job_id: int = 0 40 | recipe: str = "" 41 | container: str = "" 42 | 43 | 44 | class Telemetry: 45 | def __init__(self, log_path="/var/log/aws/clusters/sagemaker-hyperpod-recipes-telemetry.log"): 46 | self.log_path = log_path 47 | 48 | def get_account_id(self): 49 | import boto3 50 | 51 | client = boto3.client("sts") 52 | return client.get_caller_identity()["Account"] 53 | 54 | def publish_cw_log(self, log): 55 | save_log = asdict(log) 56 | with open(self.log_path, "a") as f: 57 | f.write(json.dumps(save_log, separators=(",", ":")) + "\n") 58 | 59 | def start( 60 | self, 61 | cluster_type=None, 62 | instance_type=None, 63 | num_nodes=None, 64 | job_id=None, 65 | container=None, 66 | ): 67 | if not os.path.exists(self.log_path): 68 | return 69 | account_id = self.get_account_id() 70 | cw_telemetry_start = CWTelemetryStart(account_id=account_id) 71 | cw_telemetry_start.training_start_time = int(time.time() * 1000) 72 | cw_telemetry_start.num_nodes = int(num_nodes) 73 | cw_telemetry_start.cluster_type = cluster_type 74 | cw_telemetry_start.instance_type = instance_type 75 | cw_telemetry_start.job_id = job_id 76 | cw_telemetry_start.container = container 77 | 78 | recipe = "" 79 | for arg in sys.argv: 80 | if arg.startswith("recipes="): 81 | recipe = arg.split("=")[1] 82 | cw_telemetry_start.recipe = recipe 83 | 84 | metadata = Metadata( 85 | Timestamp=int(time.time() * 1000), 86 | CloudWatchMetrics=[ 87 | MetricDirective( 88 | Namespace=CW_NAME_SPACE, 89 | Dimensions=[[]], 90 | Metrics=[Metric(Name="num_nodes", Unit="Count")], 91 | ) 92 | ], 93 | ) 94 | cw_telemetry_start._aws = metadata 95 | self.publish_cw_log(cw_telemetry_start) 96 | -------------------------------------------------------------------------------- /launcher_scripts/custom_model/run_falcon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 10 | VAL_DIR=${VAL_DIR} # Location of validation dataset 11 | 12 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/custom_model/falcon \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-falcon" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.trainer.num_nodes=4 \ 21 | recipes.model.train_batch_size=2 \ 22 | recipes.model.data.train_dir=$TRAIN_DIR \ 23 | recipes.model.data.val_dir=$VAL_DIR \ 24 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/README.md: -------------------------------------------------------------------------------- 1 | # Config for running with custom scripts 2 | The custom config allows users to use the launcher to run custom jobs that do not use our recipes. We use the Hydra format for the configs, the same as our recipes. Please refer to `config.yaml` as the template; it also aligns with the `config.yaml` in the recipe folder, with some extra configs for the cluster and the custom script. 3 | ## Config fields 4 | Here are some essential fields that users might want to override for custom training: 5 | - training_cfg: This field contains most of the configs for the training run 6 | - entry_script: Path to the entry script for training/fine-tuning. This path can be one in the container mounts. 7 | - script_args: The args that will be used to run this script 8 | - run: All runtime configs 9 | - name: Current run name 10 | - nodes: Number of nodes to use 11 | - ntasks_per_node: Number of devices to use per node 12 | - results_dir: Directory to store the results. It is recommended to keep it as `${base_results_dir}/${.name}` so everything will be in `base_results_dir` 13 | - cluster: All cluster-based configs 14 | - cluster_type: Type of the cluster; can be slurm (bcm) or k8s 15 | - instance_type: Instance type to use; if null, the cluster's default instance type is used. 16 | - cluster_config: The detailed cluster config, which differs between slurm and k8s. For details, please refer to the recipe's doc about cluster setup. 17 | - namespace: Namespace in which to launch jobs 18 | - custom_labels: k8s labels applied to the job and to each pod running the job; see more details about labels in https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ 19 | - annotations: k8s annotations added to the job; see more details in https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ 20 | - priority_class_name: Kueue scheduler priority class name; see more details in https://kueue.sigs.k8s.io/ 21 | - label_selector: k8s NodeAffinity functionality, to allow node selection based on required labels or priority scheduling based on preferred labels. 22 | - service_account_name: AWS EKS service account name, used to give pods credentials to call AWS services. 23 | - persistent_volume_claims: Specify one or more persistent volume claims to mount into the job pod. 24 | The rest of the configs are similar to the recipe configs; a short illustrative sketch of how these fields are consumed follows.
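As a purely illustrative sketch of how these fields fit together, the hypothetical helper below (it is not part of the launcher code) flattens an `entry_script` plus a `script_args` list of key/value pairs into a command line:

```python
# Hypothetical helper for illustration only; the real rendering is done inside the launcher.
from typing import Dict, List, Union


def build_command(entry_script: str, script_args: List[Dict[str, Union[str, int]]]) -> str:
    """Flatten a script_args list (as in the commented-out example in config_slurm.yaml)."""
    parts = [entry_script]
    for arg in script_args or []:
        for flag, value in arg.items():
            parts.append(f"{flag} {value}")
    return " ".join(parts)


# Mirrors the commented-out script_args block in config_slurm.yaml:
print(build_command("train.py", [{"--some_args": "debug"}, {"--some_other_args": 1}]))
# -> train.py --some_args debug --some_other_args 1
```

The distributed arguments themselves (`nodes`, `ntasks_per_node`) are not part of `script_args`; the launcher folds them into the `torchrun` command it generates, as shown in `launcher/nemo/k8s_templates/training/train-script-gpu.yaml`.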
25 | ## Launch 26 | To launch the job, run `python main.py --config-path examples/custom_script/ --config-name config` from inside the `SagemakerTrainingLauncher/launcher` folder, or use your own config folder. 27 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/config_slurm.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | defaults: 4 | - override hydra/job_logging: stdout 5 | 6 | hydra: 7 | run: 8 | dir: . 9 | output_subdir: null 10 | 11 | git: 12 | repo_url_or_path: null 13 | branch: null 14 | commit: null 15 | token: null 16 | 17 | training_cfg: 18 | 19 | entry_script: null # Path to the entry script for training/fine-tuning. This path should be inside the container or a relative path in the git repo 20 | # script_args: 21 | # - "--some_args" : "debug" 22 | # - "--some_other_args" : 1 23 | run: 24 | name: test_custom # Current run name 25 | nodes: 2 # Number of nodes to use for current training 26 | ntasks_per_node: 8 # Number of devices to use per node 27 | 28 | cluster: 29 | #Example slurm cluster 30 | 31 | cluster_type: slurm 32 | instance_type: p5.48xlarge 33 | cluster_config: 34 | exclusive: True 35 | job_name_prefix: testcustom_slurm_ 36 | slurm_create_submission_file_only: False # Set to True if you just want to create the submission file 37 | srun_args: 38 | # - "--no-container-mount-home" 39 | 40 | base_results_dir: null # Location to store the results, checkpoints and logs. 41 | container_mounts: # List of additional paths to mount to the container. They will be mounted to the same path. 42 | - null 43 | container: null # container to use 44 | slurm_docker_cfg: # Will only be used with docker on slurm 45 | docker_args: 46 | # - "--runtime=nvidia" # this is required if the docker runtime version is low 47 | post_launch_commands: # commands that will run after launching the docker container using bash 48 | 49 | env_vars: 50 | NCCL_DEBUG: DEBUG # Logging level for NCCL.
Set to "INFO" for debug information 51 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/custom_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | print("init process group") 5 | dist.init_process_group("nccl") 6 | print("rank:", dist.get_rank()) 7 | torch.cuda.set_device(dist.get_rank() % 8) 8 | tensor = torch.randn(4, 4, device="cuda") 9 | print(f"[{dist.get_rank()}] tensor {tensor}") 10 | dist.all_reduce(tensor) 11 | print(f"[{dist.get_rank()}] tensor {tensor} after reduce") 12 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/run_allreduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /recipes_collection/config.yaml 4 | 5 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 6 | 7 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 8 | VAL_DIR=${VAL_DIR} # Location of talidation dataset 9 | 10 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, ect 11 | 12 | 13 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 14 | --config-path=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/launcher_scripts/custom_script \ 15 | --config-name=config_slurm \ 16 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 17 | training_cfg.entry_script=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/laucher_scripts/custom_script/custom_allreduce.py \ 18 | container_mounts=[${SAGEMAKER_TRAINING_LAUNCHER_DIR}] \ 19 | container=\ 20 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-671b-seq8k-gpu-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=5 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-671b-seq8k-gpu-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.data.train_dir="$TRAIN_DIR" \ 25 | recipes.model.data.val_dir="$VAL_DIR" \ 26 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 27 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 28 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=4 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=4 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=4 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 16 | recipes=training/llama/p4_hf_llama3_70b_seq8k_gpu \ 17 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 20 | recipes.trainer.num_nodes=32 \ 21 | recipes.model.train_batch_size=1 \ 22 | recipes.model.data.train_dir="$TRAIN_DIR" \ 23 | recipes.model.data.val_dir="$VAL_DIR" \ 24 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=32 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=20 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=4 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 10 | VAL_DIR=${VAL_DIR} # Location of validation dataset 11 | 12 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 16 | recipes=training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain \ 17 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 18 | recipes.run.name="hf-llama3-2-11b" \ 19 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 20 | recipes.model.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.data.val_dir="$VAL_DIR" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-2-1b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-2-3b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 7 | 8 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 9 | VAL_DIR=${VAL_DIR} # Location of validation dataset 10 | 11 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 12 | 13 | 14 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 15 | recipes=training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain \ 16 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 17 | recipes.run.name="hf-llama3-2-90b" \ 18 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 19 | recipes.model.data.train_dir="$TRAIN_DIR" \ 20 | recipes.model.data.val_dir="$VAL_DIR" \ 21 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-seq131072-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
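# Usage sketch for this pretraining launcher -- a hedged example only; the export values below are hypothetical placeholders, not paths shipped with this repository. Set the dataset and experiment locations, then run the script from the repository root:
#   export TRAIN_DIR=/fsx/datasets/llama3/train
#   export VAL_DIR=/fsx/datasets/llama3/val
#   export EXP_DIR=/fsx/experiments/hf-llama3-70b
#   bash launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh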
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE="${COMPILE}" # Set to 1 to compile the model, 0 to load a pre-compiled model 10 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 11 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 12 | 13 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 14 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 15 | instance_type="trn1.32xlarge" \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain \ 17 | recipes.run.name="hf-llama3-70b" \ 18 | recipes.run.compile="$COMPILE" \ 19 | recipes.trainer.max_steps=50 \ 20 | recipes.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.model_config="$MODEL_CONFIG" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.data.train_dir="$TRAIN_DIR" \ 25 | recipes.model.data.val_dir="$VAL_DIR" \ 26 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 27 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 28 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
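# Usage sketch for this LoRA fine-tuning launcher -- a hedged example only; the model ID, token, and paths below are hypothetical placeholders, not values from this repository:
#   export HF_MODEL_NAME_OR_PATH=meta-llama/Meta-Llama-3-8B
#   export HF_ACCESS_TOKEN=hf_xxxxxxxx   # optional; typically only required for gated models
#   export TRAIN_DIR=/fsx/datasets/sft/train VAL_DIR=/fsx/datasets/sft/val
#   export EXP_DIR=/fsx/experiments/hf-llama3-8b-lora
#   bash launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh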
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_dpo \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-dpo" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE="${COMPILE}" 10 | COMPILER_CACHE_PATH="${COMPILER_CACHE_PATH}" 11 | TOKENIZER_TYPE="${TOKENIZER_TYPE}" 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | RESUME_FROM_CHECKPOINT_DIR="${RESUME_FROM_CHECKPOINT_DIR}" 15 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 16 | 17 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 18 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 19 | instance_type="trn1.32xlarge" \ 20 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning \ 21 | recipes.run.name="hf-llama3-8b-sft" \ 22 | recipes.run.compile="$COMPILE" \ 23 | recipes.trainer.max_steps=50 \ 24 | recipes.compiler_cache_url="$COMPILER_CACHE_PATH" \ 25 | recipes.data.tokenizer.type="$TOKENIZER_TYPE" \ 26 | recipes.data.train_dir="$TRAIN_DIR" \ 27 | recipes.data.val_dir="$VAL_DIR" \ 28 | recipes.exp_manager.resume_from_checkpoint="$RESUME_FROM_CHECKPOINT_DIR" \ 29 | recipes.model.model_config="$MODEL_CONFIG" \ 30 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE=0 10 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 11 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 12 | 13 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 14 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 15 | instance_type="trn1.32xlarge" \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain \ 17 | recipes.run.name="hf-llama3-8b" \ 18 | recipes.run.compile="$COMPILE" \ 19 | recipes.trainer.max_steps=50 \ 20 | recipes.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.model_config="$MODEL_CONFIG" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
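# Note: SAGEMAKER_TRAINING_LAUNCHER_DIR falls back to "$(pwd)" (see the parameter expansion above), so these launchers are normally invoked from the repository root. A hedged sketch of overriding it explicitly -- the paths below are hypothetical placeholders, not values from this repository:
#   SAGEMAKER_TRAINING_LAUNCHER_DIR=/opt/sagemaker-hyperpod-recipes \
#   TRAIN_DIR=/fsx/data/train VAL_DIR=/fsx/data/val EXP_DIR=/fsx/exp/mistral-7b \
#   bash launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh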
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
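# The cluster backend for every launcher in this collection is selected in recipes_collection/config.yaml rather than in the script itself. A hedged sketch of switching to Kubernetes -- the two fields below simply mirror the options documented in that file:
#   defaults:
#     - cluster: k8s
#   cluster_type: k8s
# With those values, the same bash invocation of this script is rendered from the k8s training templates instead of being submitted as a Slurm job.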
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = 7.0 3 | # durations=0 will display all tests' execution times, sorted starting from the slowest one. 4 | # -vv will also display tests with duration = 0.00s 5 | addopts = [ 6 | "--cache-clear", 7 | "--quiet", 8 | "--durations=0", 9 | "--cov=launcher/", 10 | # uncomment this line to see a detailed HTML test coverage report instead of the usual summary table output to stdout. 11 | # "--cov-report=html", 12 | "tests/", 13 | ] 14 | testpaths = ["tests"] 15 | norecursedirs = [".eggs", ".pytest_cache", "*.egg-info", ".git", "build"] 16 | -------------------------------------------------------------------------------- /recipes_collection/cluster/k8s.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | pullPolicy: Always # policy used to pull the container image; can be Always, IfNotPresent, or Never 4 | restartPolicy: Never # restart policy 5 | namespace: default # the namespace to submit the job to 6 | # create customized labels for the PyTorchJob and the Pods it deploys. 7 | # Example: 8 | # custom_labels: 9 | # label-key-1: label-value-1 10 | # label-key-2: label-value-2 11 | custom_labels: null 12 | # create customized annotations for the jobs.
13 | # Example: 14 | # annotations: 15 | # annotation-key-1: annotation-value-1 16 | # annotation-key-2: annotation-value-2 17 | annotations: null 18 | # add service account to job pods 19 | # Example: 20 | # serviceAccountName: service_account 21 | service_account_name: null 22 | # priorityClassName for Kueue scheduler to decide jobs priority 23 | priority_class_name: null 24 | 25 | # temp volume, usually used to mount temp directory 26 | # Example: 27 | # volumes: 28 | # - volumeName: data1 29 | # hostPath: "/data" 30 | # mountPath: "/data" 31 | 32 | volumes: null 33 | 34 | # persistent volume, usually used to mount FSx 35 | # Example: 36 | # persistent_volume_claims: 37 | # - claimName: null 38 | # mountPath: null 39 | # - claimName: null 40 | # mountPath: null 41 | 42 | # persistent volumes, usually used to mount FSx 43 | persistent_volume_claims: 44 | - null 45 | # This claim should be created before running. Example: 46 | # - claimName: fsx-claim 47 | # mountPath: data 48 | 49 | # Create k8s NodeAffinity to select nodes to deploy jobs which matches required and preferred labels 50 | # Structure: 51 | # label_selector: 52 | # required: 53 | # preferred: 54 | # weights: 55 | # Example: 56 | # label_selector: 57 | # required: 58 | # example-label-key: 59 | # - expected-label-value-1 60 | # - expected-label-value-2 61 | # preferred: 62 | # preferred-label-key: 63 | # - preferred-label-value-1 64 | # - preferred-label-value-2 65 | # weights: 66 | # - 100 67 | label_selector: null 68 | 69 | # The clean up policy after the job completes or fails. 70 | cleanPodPolicy: null 71 | -------------------------------------------------------------------------------- /recipes_collection/cluster/slurm.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | exclusive: True 4 | mem: 0 5 | job_name_prefix: 'sagemaker-' 6 | slurm_create_submission_file_only: False # Setting to True if just want to create submission file 7 | stderr_to_stdout: True # Setting to False to split the stderr and stdout logs 8 | srun_args: 9 | # - "--no-container-mount-home" 10 | slurm_docker_cfg: 11 | docker_args: 12 | # - "--runtime=nvidia" # this is required if the docker runtime version is low 13 | post_launch_commands: # commands will run after launching the docker container using bash 14 | container_mounts: # List of additional paths to mount to container. They will be mounted to same path. 15 | - null 16 | -------------------------------------------------------------------------------- /recipes_collection/cluster/sm_jobs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | sm_jobs_config: 15 | output_path: null # S3 output path to output artifacts 16 | tensorboard_config: 17 | output_path: null # Output path for tensorboard logs 18 | container_logs_path: null # Path to logs on the container 19 | wait: True # Whether to wait for training job to finish 20 | inputs: # Inputs to call fit with. Set either s3 or file_system, not both. 21 | s3: # Dictionary of channel names and s3 URIs. For GPUs, use channels for train and validation. 22 | train: null 23 | val: null 24 | file_system: # If using file system input, please pass VPC params in additional_estimator_kwargs. 25 | id: null 26 | type: null 27 | directory_path: null 28 | additional_estimator_kwargs: # All other additional args to pass to estimator. Must be int, float or string. 29 | max_run: 1800 30 | enable_remote_debug: True 31 | recipe_overrides: null 32 | -------------------------------------------------------------------------------- /recipes_collection/config.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | defaults: 4 | - _self_ 5 | - cluster: slurm # set to `slurm`, `k8s` or `sm_jobs`, depending on the desired cluster 6 | - recipes: training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain # select desired config inside the training directory 7 | - override hydra/job_logging: stdout 8 | 9 | cluster_type: slurm # bcm, bcp, k8s or sm_jobs. If bcm, k8s or sm_jobs, it must match - cluster above. 10 | # If using sm_jobs cluster_type, set sm_jobs_config. See cluster/sm_jobs.yaml for example. 11 | 12 | hydra: 13 | run: 14 | dir: . 15 | output_subdir: null 16 | 17 | debug: False 18 | 19 | instance_type: p5.48xlarge 20 | base_results_dir: null # Location to store the results, checkpoints and logs. 21 | 22 | container: null 23 | 24 | git: 25 | repo_url_or_path: null 26 | branch: null 27 | commit: null 28 | entry_script: null 29 | token: null 30 | update_adapter: false # if true it will re-install the Adapter code but not its dependencies 31 | 32 | env_vars: 33 | NCCL_DEBUG: WARN # Logging level for NCCL. Set to "INFO" for debug information 34 | 35 | # Do not modify below, use the values above instead. 36 | training_config: ${hydra:runtime.choices.recipes} 37 | -------------------------------------------------------------------------------- /recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 2 | 3 | run: 4 | name: llama3-2-90b 5 | results_dir: ${base_results_dir}/${.name} 6 | time_limit: "6-00:00:00" 7 | model_type: hf # huggingface for our recipes 8 | 9 | trainer: 10 | devices: 8 11 | num_nodes: 32 12 | accelerator: gpu 13 | precision: bf16 14 | max_steps: 50 15 | log_every_n_steps: 1 16 | 17 | val_check_interval: 1 18 | accumulate_grad_batches: 1 19 | gradient_clip_val: 1.0 20 | 21 | 22 | exp_manager: 23 | exp_dir: null 24 | name: experiment 25 | # experiment loggers 26 | create_tensorboard_logger: False 27 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 28 | create_mlflow_logger: False 29 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 30 | create_wandb_logger: False 31 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 32 | create_checkpoint_callback: True 33 | checkpoint_callback_params: 34 | # Set save_top_k = 0 to disable sharded checkpointing 35 | save_top_k: 0 36 | every_n_train_steps: 10 37 | monitor: "step" 38 | mode: "max" 39 | save_last: False 40 | checkpoint_dir: ${..exp_manager.exp_dir}/checkpoints/ 41 | resume_from_checkpoint: null 42 | # Set auto_checkpoint = False to disable auto resilience checkpointing 43 | auto_checkpoint: 44 | enabled: False 45 | export_full_model: 46 | # Set every_n_train_steps = 0 to disable full checkpointing 47 | every_n_train_steps: 0 48 | save_last: False 49 | 50 | 51 | use_smp_model: False #enable SMP 52 | distributed_backend: nccl 53 | 54 | 55 | # Start training from pretrained model 56 | model: 57 | model_type: llama_v3 58 | do_finetune: False 59 | hf_model_name_or_path: "meta-llama/Llama-3.2-90B-Vision-Instruct" 60 | hf_access_token: null 61 | train_batch_size: 1 62 | seed: 12345 63 | grad_clip: 1.0 64 | use_flash_attention: True 65 | activation_checkpointing: True 66 | multi_modal: True 67 | delayed_param: True 68 | 69 | # FSDP Configs 70 | sharding_strategy: hybrid_shard 71 | forward_prefetch: True 72 | shard_degree: 256 73 | backward_fetch_policy: backward_pre 74 | auto_wrap_policy: transformer_auto_wrap_policy 75 | limit_all_gathers: true 76 | use_orig_param: False 77 | 78 | # model architecture 79 | max_context_width: 8192 80 | precision: bf16 81 | lr_decay_iters: 47683 82 | log_reduced_training_loss: True 83 | 84 | # PEFT 85 | peft: 86 | peft_type: null # lora 87 | 88 | # Optimizer 89 | optim: 90 | name: adamw 91 | lr: 2e-4 92 | weight_decay: 0.01 93 | betas: 94 | - 0.9 95 | - 0.98 96 | sched: 97 | name: CosineAnnealing 98 | warmup_steps: 500 99 | constant_steps: 0 100 | min_lr: 2e-5 101 | 102 | # Data 103 | data: 104 | train_dir: null 105 | val_dir: null 106 | dataset_type: hf 107 | use_synthetic_data: False 108 | tokenizer_name: null 109 | zipped_data: False 110 | 111 | # Profiling configs 112 | # Viztracer profiling options 113 | viztracer: 114 | enabled: false 115 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hydra-core==1.3.2 2 | omegaconf>=2.2,<2.3 3 | pynvml==11.4.1 4 | requests==2.26.0 5 | tqdm==4.62.3 6 | zstandard==0.15.2 7 | tensorboard==2.12.0 8 | boto3==1.35.66 9 | -------------------------------------------------------------------------------- /scripts/licenseChecker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"). You 6 | # may not use this file except in compliance with the License. A copy of 7 | # the License is located at 8 | # 9 | # http://aws.amazon.com/apache2.0/ 10 | # 11 | # or in the "license" file accompanying this file. This file is 12 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 13 | # ANY KIND, either express or implied. See the License for the specific 14 | # language governing permissions and limitations under the License. 15 | 16 | check_licenses() { 17 | LICENSE_LIST=$(cat ./ApprovedLicenses.txt | tr '\n' '|'| sed 's/|$//') 18 | pip-licenses --summary > LicenseSummary.txt 19 | awk '{$1=""; print $0}' ./LicenseSummary.txt | tail -n +2 | sed 's/;/\n/g' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//'| sort -u > ./newLicenseSummary.txt 20 | while IFS= read -r line || [[ -n "$line" ]]; do 21 | if ! echo "$LICENSE_LIST" | grep -q "$line"; then 22 | echo "License '$line' is not in the allowed list." 23 | exit 1 24 | fi 25 | done < ./newLicenseSummary.txt 26 | 27 | if ! grep -q "prohibited-license: Did not find content matching specified patterns" ./scanOutput.txt; then 28 | echo "Prohibited License Used in Source Code Scan: " 29 | sed -n '/⚠ prohibited-license:/,/⚠ third-party-license-file:/p' ./scanOutput.txt | sed '1d;$d'| cat 30 | exit 1 31 | fi 32 | echo "License Check complete" 33 | } 34 | 35 | check_licenses 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-hyperpod-recipes/6a633c5500f60cea22d9409e06b069c1184b43e8/tests/__init__.py -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/config/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 10 13 | exp_manager: 14 | exp_dir: /fsx/exp/ 15 | name: my_experiment 16 | # experiment loggers 17 | create_tensorboard_logger: False 18 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 19 | create_mlflow_logger: False 20 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 21 | create_wandb_logger: False 22 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 23 | create_checkpoint_callback: true 24 | checkpoint_callback_params: 25 | save_top_k: 10 26 | use_smp_model: true 27 | distributed_backend: smddp 28 | model: 29 | model_type: llama_v3 30 | train_batch_size: 4 31 | val_batch_size: 1 32 | tensor_model_parallel_degree: 1 33 | expert_model_parallel_degree: 1 34 | moe: false 35 | sequence_parallel: true 36 | activation_checkpointing: true 37 | 
activation_loading_horizon: 2 38 | delayed_param: true 39 | offload_activations: false 40 | use_smp_model_flash_attn: false 41 | seed: 12345 42 | grad_clip: 1.0 43 | hf_pretrained_model: null 44 | sharding_strategy: hybrid_shard 45 | forward_prefetch: true 46 | shard_degree: 16 47 | backward_fetch_policy: backward_pre 48 | auto_wrap_policy: transformer_auto_wrap_policy 49 | limit_all_gathers: true 50 | use_orig_param: false 51 | max_context_width: 2048 52 | max_position_embeddings: 2048 53 | num_hidden_layers: 8 54 | hidden_size: 4096 55 | num_attention_heads: 32 56 | llama_intermediate_size: 14336 57 | initializer_range: 0.02 58 | layernorm_epsilon: 1.0e-05 59 | vocab_size: 32000 60 | num_key_value_heads: 8 61 | transformer_engine: true 62 | fp8: false 63 | fp8_amax_history_len: 1024 64 | fp8_amax_compute_algo: max 65 | do_finetune: false 66 | finetune_with_pretrained_weights: false 67 | pretrained_model_weights: null 68 | precision: bf16 69 | lr_decay_iters: 47683 70 | log_reduced_training_loss: true 71 | optim: 72 | name: adamw 73 | lr: 0.0001 74 | weight_decay: 0.01 75 | betas: 76 | - 0.9 77 | - 0.95 78 | sched: 79 | name: CosineAnnealing 80 | warmup_steps: 0 81 | constant_steps: 0 82 | min_lr: 0.000001 83 | data: 84 | train_dir: // 85 | val_dir: null 86 | dataset_type: gpt 87 | use_synthetic_data: false 88 | zipped_data: true 89 | cluster_type: k8s 90 | launcher_scripts_path: {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts/ 91 | data_config: llama-8b 92 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training-config.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: training-config-{{ $config.jobName }} 6 | data: 7 | config.yaml: |- 8 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 9 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: test_container 3 | pullPolicy: Always 4 | trainingConfig: 5 | jobName: llama-8b 6 | namespace: default 7 | scriptPath: examples/llama/llama_pretrain.py 8 | scriptArgs: --config-path=/config --config-name=config.yaml 9 | customScript: null 10 | annotations: null 11 | customLabels: null 12 | priority_class_name: null 13 | device: gpu 14 | numEFADevices: 32 15 | numNeuronDevices: null 16 | ntasksPerNode: 8 17 | nodes: 16 18 | restartPolicy: Never 19 | wandbKey: nil 20 | serviceAccountName: null 21 | compile: 0 22 | persistentVolumeClaims: 23 | - null 24 | volumes: null 25 | git: 26 | repo_url_or_path: https://test_token@github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git 27 | branch: test_branch 28 | commit: test_commit 29 | token: null 30 | update_adapter: false 31 | pre_script: [] 32 | post_script: [] 33 | labelSelector: 34 | required: null 35 | preferred: null 36 | weights: null 37 | cleanPodPolicy: null 38 | envVars: 39 | NCCL_DEBUG: WARN 40 | NEMO_LAUNCHER_DEBUG: 1 41 | SLURM_NTASKS_PER_NODE: 8 42 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 43 | FI_PROVIDER: efa 44 | NCCL_SOCKET_IFNAME: ^lo,docker0,veth_def_agent 45 | NCCL_IGNORE_DISABLED_P2P: '1' 46 | TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' 47 | TORCH_DIST_INIT_BARRIER: '1' 48 | 
CUDA_DEVICE_MAX_CONNECTIONS: '1' 49 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 4 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 4 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 2 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 16 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 16384 67 | max_position_embeddings: 16384 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | helm install --timeout=15m --wait --namespace default llama-8b {$results_dir}/llama-8b/k8s_template 3 | 
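Note on the baseline artifacts above: the {$results_dir} and {$workspace_dir} tokens are placeholders that stand in for the concrete paths the launcher writes at generation time, and the rendered chart is applied with the single helm install call in llama-8b_submission.sh. A minimal sketch of regenerating these k8s artifacts, condensed from the recipe k8s workflow test later in this listing (tests/k8s_workflow/test_recipe_k8s_workflow.py); the container name, instance type, and temp directory here are the test's placeholder values, not requirements:

    from main import main
    from tests.test_utils import create_temp_directory, make_hydra_cfg_instance

    # Compose recipes_collection/config.yaml for a k8s run, as the workflow test does.
    artifacts_dir = create_temp_directory()
    overrides = [
        "instance_type=p5.48xlarge",
        "base_results_dir={}".format(artifacts_dir),
        "container=test_container",          # placeholder image URI
        "cluster=k8s",
        "cluster_type=k8s",
        "+env_vars.NEMO_LAUNCHER_DEBUG=1",   # set by the workflow tests below
    ]
    cfg = make_hydra_cfg_instance("../recipes_collection", "config", overrides)

    # Writes <artifacts_dir>/llama-8b/k8s_template/ plus llama-8b_submission.sh,
    # the same layout captured in the baseline artifacts above.
    main(cfg)

Running the generated llama-8b_submission.sh then performs the same helm install shown above against the default namespace.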
-------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: test_container 3 | pullPolicy: Always 4 | trainingConfig: 5 | jobName: test_custom 6 | namespace: default 7 | scriptPath: test.py 8 | scriptArgs: '--some_args debug --some_other_args 1 ' 9 | customScript: true 10 | annotations: null 11 | customLabels: null 12 | priority_class_name: null 13 | device: gpu 14 | numEFADevices: 32 15 | numNeuronDevices: null 16 | ntasksPerNode: 8 17 | nodes: 8 18 | restartPolicy: Never 19 | wandbKey: nil 20 | serviceAccountName: null 21 | compile: 0 22 | persistentVolumeClaims: null 23 | volumes: null 24 | git: 25 | repo_url_or_path: https://github.com/example 26 | branch: null 27 | commit: null 28 | token: null 29 | update_adapter: null 30 | pre_script: [] 31 | post_script: [] 32 | labelSelector: 33 | required: null 34 | preferred: null 35 | weights: null 36 | cleanPodPolicy: null 37 | envVars: 38 | NCCL_DEBUG: DEBUG 39 | NEMO_LAUNCHER_DEBUG: 1 40 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 41 | FI_PROVIDER: efa 42 | NCCL_SOCKET_IFNAME: ^lo,docker0,veth_def_agent 43 | NCCL_IGNORE_DISABLED_P2P: '1' 44 | TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' 45 | TORCH_DIST_INIT_BARRIER: '1' 46 | CUDA_DEVICE_MAX_CONNECTIONS: '1' 47 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/test_custom_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | helm install --timeout=15m --wait --namespace default test-custom {$results_dir}/test_custom/k8s_template 3 | -------------------------------------------------------------------------------- /tests/k8s_workflow/test_custom_k8s_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from omegaconf import OmegaConf 4 | 5 | from main import main 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | from tests.test_utils import ( 10 | compare_artifacts, 11 | create_temp_directory, 12 | make_hydra_cfg_instance, 13 | ) 14 | 15 | 16 | def compare_custom_k8s_artifacts(artifacts_dir): 17 | logger.info("Comparing custom k8s artifacts") 18 | 19 | artifacts_paths = [ 20 | "/test_custom/test_custom_submission.sh", 21 | "/test_custom/k8s_template/Chart.yaml", 22 | "/test_custom/k8s_template/values.yaml", 23 | "/test_custom/k8s_template/templates/training.yaml", 24 | ] 25 | 26 | k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" 27 | compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) 28 | 29 | 30 | def test_custom_k8s_workflow(): 31 | logger.info("Testing k8s workflow") 32 | 33 | artifacts_dir = create_temp_directory() 34 | overrides = [ 35 | "training_cfg.entry_script=test.py", 36 | "cluster.instance_type=p5.48xlarge", 37 | "base_results_dir={}".format(artifacts_dir), 38 | "container=test_container", 39 | "git.repo_url_or_path=https://github.com/example", 40 
| "+env_vars.NEMO_LAUNCHER_DEBUG=1", 41 | ] 42 | 43 | sample_custom_k8s_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_k8s", overrides) 44 | 45 | logger.info("\nsample_custom_k8s_config\n") 46 | logger.info(OmegaConf.to_yaml(sample_custom_k8s_config)) 47 | 48 | main(sample_custom_k8s_config) 49 | 50 | compare_custom_k8s_artifacts(artifacts_dir) 51 | -------------------------------------------------------------------------------- /tests/k8s_workflow/test_recipe_k8s_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from omegaconf import OmegaConf 4 | 5 | from main import main 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | import pytest 10 | 11 | from tests.test_utils import ( 12 | compare_artifacts, 13 | create_temp_directory, 14 | make_hydra_cfg_instance, 15 | ) 16 | 17 | 18 | def compare_recipe_k8s_artifacts(artifacts_dir): 19 | logger.info("Comparing recipe k8s artifacts") 20 | 21 | artifacts_paths = [ 22 | "/llama-8b/llama-8b_submission.sh", 23 | # "/llama-8b/llama-8b_hydra.yaml", # Do not test recipe, this changes often 24 | "/llama-8b/k8s_template/values.yaml", 25 | "/llama-8b/k8s_template/Chart.yaml", 26 | # "/llama-8b/k8s_template/config/llama-8b_hydra.yaml", # Do not test recipe, this changes often 27 | "/llama-8b/k8s_template/templates/training.yaml", 28 | "/llama-8b/k8s_template/templates/training-config.yaml", 29 | ] 30 | 31 | k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" 32 | compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) 33 | 34 | 35 | def test_recipe_k8s_workflow(): 36 | logger.info("Testing recipe k8s workflow") 37 | 38 | artifacts_dir = create_temp_directory() 39 | overrides = [ 40 | "instance_type=p5.48xlarge", 41 | "base_results_dir={}".format(artifacts_dir), 42 | "container=test_container", 43 | "cluster=k8s", 44 | "cluster_type=k8s", 45 | "+env_vars.NEMO_LAUNCHER_DEBUG=1", 46 | "git.repo_url_or_path=https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git", 47 | "git.branch=test_branch", 48 | "git.commit=test_commit", 49 | "git.token=test_token", 50 | ] 51 | 52 | sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) 53 | 54 | logger.info("\nsample_recipe_k8s_config\n") 55 | logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) 56 | 57 | main(sample_recipe_k8s_config) 58 | 59 | compare_recipe_k8s_artifacts(artifacts_dir) 60 | 61 | 62 | def test_recipe_k8s_workflow_invalid(): 63 | logger.info("Testing recipe k8s workflow with invalid git config") 64 | 65 | artifacts_dir = create_temp_directory() 66 | overrides = [ 67 | "instance_type=p5.48xlarge", 68 | "base_results_dir={}".format(artifacts_dir), 69 | "container=test_container", 70 | "cluster=k8s", 71 | "cluster_type=k8s", 72 | "+env_vars.NEMO_LAUNCHER_DEBUG=1", 73 | "git.repo_url_or_path=/local/path", 74 | ] 75 | 76 | sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) 77 | 78 | logger.info("\nsample_recipe_k8s_config\n") 79 | logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) 80 | 81 | with pytest.raises(ValueError): 82 | main(sample_recipe_k8s_config) 83 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image 
is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 4 | # Login ECR 5 | aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com 6 | 7 | # Getting EFA devices 8 | device=("--device=/dev/gdrdrv") 9 | while IFS= read -r -d '' d; do 10 | device+=("--device=${d}") 11 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 12 | 13 | # Clean old containers 14 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 15 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 16 | 17 | docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 18 | docker run --gpus 32 \ 19 | --privileged --rm -d --name "sm_training_launcher" \ 20 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 21 | --security-opt seccomp=unconfined \ 22 | "${device[@]}" \ 23 | -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 24 | -v {$results_dir}:{$results_dir} \ 25 | test_docker_cmd \ 26 | "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity 27 | 28 | # Running post launching commands 29 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 30 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 31 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 32 | docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" 33 | 34 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 2 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 1 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: true 53 | activation_loading_horizon: 2 54 | delayed_param: true 55 | 
offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 8 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: false 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 8192 67 | max_position_embeddings: 8192 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --exclusive 5 | #SBATCH --job-name=sagemaker-hf-llama3-8b 6 | #SBATCH --mem=0 7 | #SBATCH --nodes=4 8 | #SBATCH --output={$results_dir}/hf-llama3-8b/log-sagemaker-hf-llama3-8b_%j.out 9 | #SBATCH --time=6-00:00:00 10 | 11 | # setup 12 | export NCCL_DEBUG=WARN 13 | export FI_PROVIDER=efa 14 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 15 | export NCCL_IGNORE_DISABLED_P2P=1 16 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 17 | export TORCH_DIST_INIT_BARRIER=1 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Prepare distributed files 22 | srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/hf-llama3-8b/hostname" 23 | 24 | srun -l bash {$results_dir}/hf-llama3-8b/launch_docker_container.sh 25 | srun -l bash {$results_dir}/hf-llama3-8b/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=WARN 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=4 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | 
GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | examples/llama/llama_pretrain.py \ 34 | --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/docker_exec_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | function job_epilogue { 5 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 6 | } 7 | trap job_epilogue EXIT SIGTERM SIGINT 8 | 9 | docker exec sm_training_launcher bash {$results_dir}/llama-8b/train_script.sh 10 | 11 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 4 | # Login ECR 5 | aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com 6 | 7 | # Getting EFA devices 8 | device=("--device=/dev/gdrdrv") 9 | while IFS= read -r -d '' d; do 10 | device+=("--device=${d}") 11 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 12 | 13 | # Clean old containers 14 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 15 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 16 | 17 | docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 18 | docker run --gpus 8 \ 19 | --privileged --rm -d --name "sm_training_launcher" \ 20 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 21 | --security-opt seccomp=unconfined \ 22 | "${device[@]}" \ 23 | -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 24 | -v {$results_dir}:{$results_dir} \ 25 | test_docker_cmd \ 26 | "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity 27 | 28 | # Running post launching commands 29 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 30 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 31 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 32 | docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" 33 | 34 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 
1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 2 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 1 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: true 53 | activation_loading_horizon: 2 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 8 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: false 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 8192 67 | max_position_embeddings: 8192 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/sagemaker-llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --exclusive 5 | #SBATCH --job-name=sagemaker-llama-8b 6 | #SBATCH --mem=0 7 | #SBATCH --nodes=16 8 | #SBATCH --output={$results_dir}/llama-8b/log-sagemaker-llama-8b_%j.out 9 | #SBATCH --time=6-00:00:00 10 | 11 | # setup 12 | export NCCL_DEBUG=WARN 13 | export FI_PROVIDER=efa 14 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 15 | export NCCL_IGNORE_DISABLED_P2P=1 16 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 17 | export TORCH_DIST_INIT_BARRIER=1 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Prepare distributed files 22 | srun -l bash -c "scontrol show 
hostnames | sort > {$results_dir}/llama-8b/hostname" 23 | 24 | srun -l bash {$results_dir}/llama-8b/launch_docker_container.sh 25 | srun -l bash {$results_dir}/llama-8b/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=WARN 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=16 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | examples/llama/llama_pretrain.py \ 34 | --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/docker_exec_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | function job_epilogue { 5 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 6 | } 7 | trap job_epilogue EXIT SIGTERM SIGINT 8 | 9 | docker exec sm_training_launcher bash {$results_dir}/test_custom/train_script.sh 10 | 11 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image is test_container" 4 | echo "Not an ECR image, skipping ECR login" 5 | # Getting EFA devices 6 | device=("--device=/dev/gdrdrv") 7 | while IFS= read -r -d '' d; do 8 | device+=("--device=${d}") 9 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 10 | 11 | # Clean old containers 12 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 13 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 14 | 15 | docker pull "test_container" 16 | docker run --gpus 8 \ 17 | --privileged --rm -d --name "sm_training_launcher" \ 18 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 19 | --security-opt seccomp=unconfined \ 20 | "${device[@]}" \ 21 | -v 
{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 22 | -v {$results_dir}:{$results_dir} \ 23 | "test_container" sleep infinity 24 | 25 | # Running post launching commands 26 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 27 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 28 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 29 | 30 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/testcustom_slurm_test_custom_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --error={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.err 5 | #SBATCH --exclusive 6 | #SBATCH --job-name=testcustom_slurm_test_custom 7 | #SBATCH --nodes=2 8 | #SBATCH --output={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.out 9 | 10 | # setup 11 | export NCCL_DEBUG=DEBUG 12 | export FI_PROVIDER=efa 13 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 14 | export NCCL_IGNORE_DISABLED_P2P=1 15 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 16 | export TORCH_DIST_INIT_BARRIER=1 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | 19 | 20 | # Prepare distributed files 21 | srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/test_custom/hostname" 22 | 23 | srun -l bash {$results_dir}/test_custom/launch_docker_container.sh 24 | srun -l bash {$results_dir}/test_custom/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=DEBUG 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/test_custom/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/test_custom/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=2 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/example $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | test.py \ 34 | -------------------------------------------------------------------------------- /tests/slurm_workflow/test_custom_slurm_workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import logging 15 | 16 | from omegaconf import OmegaConf 17 | 18 | from main import main 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | from tests.test_utils import ( 24 | compare_artifacts, 25 | create_temp_directory, 26 | make_hydra_cfg_instance, 27 | ) 28 | 29 | 30 | def compare_custom_slurm_artifacts(artifacts_dir): 31 | logger.info("Comparing custom slurm artifacts") 32 | 33 | artifacts_paths = [ 34 | "/test_custom/launch_docker_container.sh", 35 | "/test_custom/testcustom_slurm_test_custom_submission.sh", 36 | "/test_custom/train_script.sh", 37 | "/test_custom/docker_exec_script.sh", 38 | ] 39 | slurm_baseline_artifacts_path = "/tests/slurm_workflow/slurm_baseline_artifacts" 40 | compare_artifacts(artifacts_paths, artifacts_dir, slurm_baseline_artifacts_path) 41 | 42 | 43 | def test_custom_slurm_workflow(): 44 | logger.info("Testing custom slurm workflow") 45 | 46 | artifacts_dir = create_temp_directory() 47 | overrides = [ 48 | "training_cfg.entry_script=test.py", 49 | "cluster.instance_type=p5.48xlarge", 50 | "cluster.cluster_type=slurm", 51 | "cluster.cluster_config.slurm_create_submission_file_only=True", 52 | "git.repo_url_or_path=https://github.com/example", 53 | "base_results_dir={}".format(artifacts_dir), 54 | "container=test_container", 55 | ] 56 | 57 | sample_custom_slurm_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_slurm", overrides) 58 | 59 | logger.info("\nsample_custom_slurm_config\n") 60 | logger.info(OmegaConf.to_yaml(sample_custom_slurm_config)) 61 | 62 | main(sample_custom_slurm_config) 63 | 64 | compare_custom_slurm_artifacts(artifacts_dir) 65 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-hyperpod-recipes/6a633c5500f60cea22d9409e06b069c1184b43e8/tests/sm_jobs_workflow/__init__.py -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/llama3-2-11b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3-2-11b 3 | results_dir: {$results_dir}/llama3-2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | # experiment loggers 20 | create_tensorboard_logger: False 21 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 22 | create_mlflow_logger: False 23 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 24 | create_wandb_logger: False 25 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 26 | 
create_checkpoint_callback: true 27 | checkpoint_callback_params: 28 | save_top_k: 0 29 | every_n_train_steps: 10 30 | monitor: step 31 | mode: max 32 | save_last: false 33 | checkpoint_dir: None/checkpoints/ 34 | resume_from_checkpoint: null 35 | auto_checkpoint: 36 | enabled: false 37 | export_full_model: 38 | every_n_train_steps: 0 39 | save_last: false 40 | use_smp_model: false 41 | distributed_backend: nccl 42 | model: 43 | model_type: llama_v3 44 | do_finetune: false 45 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 46 | hf_access_token: null 47 | train_batch_size: 1 48 | seed: 12345 49 | grad_clip: 1.0 50 | use_flash_attention: true 51 | activation_checkpointing: true 52 | multi_modal: true 53 | delayed_param: false 54 | sharding_strategy: hybrid_shard 55 | forward_prefetch: true 56 | shard_degree: 32 57 | backward_fetch_policy: backward_pre 58 | auto_wrap_policy: transformer_auto_wrap_policy 59 | limit_all_gathers: true 60 | use_orig_param: false 61 | max_context_width: 8192 62 | precision: bf16 63 | lr_decay_iters: 47683 64 | log_reduced_training_loss: true 65 | peft: 66 | peft_type: null 67 | optim: 68 | name: adamw 69 | lr: 0.0002 70 | weight_decay: 0.01 71 | betas: 72 | - 0.9 73 | - 0.98 74 | sched: 75 | name: CosineAnnealing 76 | warmup_steps: 500 77 | constant_steps: 0 78 | min_lr: 2.0e-05 79 | data: 80 | train_dir: null 81 | val_dir: null 82 | dataset_type: hf 83 | use_synthetic_data: false 84 | tokenizer_name: null 85 | zipped_data: false 86 | viztracer: 87 | enabled: false 88 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/llama3-2-11b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama3-2-11b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/recipe.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3-2-11b 3 | results_dir: /var/folders/6w/nm79zb595ll18wyj6czl6gfm0000gq/T/tmp1nal2g5n/llama3-2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 100 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | # experiment loggers 20 | create_tensorboard_logger: False 21 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 22 | create_mlflow_logger: False 23 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 24 | create_wandb_logger: False 25 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 26 | create_checkpoint_callback: true 27 | checkpoint_callback_params: 28 | save_top_k: 0 29 | every_n_train_steps: 10 30 | monitor: step 31 | mode: max 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: false 39 | use_smp_model: false 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | do_finetune: false 44 | hf_model_name_or_path: 
meta-llama/Llama-3.2-11B-Vision-Instruct 45 | hf_access_token: null 46 | train_batch_size: 1 47 | seed: 12345 48 | grad_clip: 1.0 49 | use_flash_attention: true 50 | activation_checkpointing: true 51 | multi_modal: true 52 | delayed_param: false 53 | sharding_strategy: hybrid_shard 54 | forward_prefetch: true 55 | shard_degree: 32 56 | backward_fetch_policy: backward_pre 57 | auto_wrap_policy: transformer_auto_wrap_policy 58 | limit_all_gathers: false 59 | use_orig_param: false 60 | max_context_width: 8192 61 | precision: bf16 62 | lr_decay_iters: 47683 63 | log_reduced_training_loss: true 64 | peft: 65 | peft_type: null 66 | optim: 67 | name: adamw 68 | lr: 0.0002 69 | weight_decay: 0.01 70 | betas: 71 | - 0.9 72 | - 0.98 73 | sched: 74 | name: CosineAnnealing 75 | warmup_steps: 500 76 | constant_steps: 0 77 | min_lr: 2.0e-05 78 | data: 79 | train_dir: null 80 | val_dir: null 81 | dataset_type: hf 82 | use_synthetic_data: false 83 | tokenizer_name: null 84 | zipped_data: false 85 | viztracer: 86 | enabled: false 87 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.45.2 -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3.2-11b 3 | results_dir: {$results_dir}/llama3.2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | create_tensorboard_logger: true 20 | create_checkpoint_callback: true 21 | checkpoint_callback_params: 22 | save_top_k: 0 23 | every_n_train_steps: 10 24 | monitor: step 25 | mode: max 26 | save_last: false 27 | checkpoint_dir: None/checkpoints/ 28 | resume_from_checkpoint: null 29 | auto_checkpoint: 30 | enabled: false 31 | export_full_model: 32 | every_n_train_steps: 0 33 | save_last: false 34 | use_smp_model: false 35 | distributed_backend: nccl 36 | model: 37 | model_type: llama_v3 38 | do_finetune: false 39 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 40 | hf_access_token: null 41 | train_batch_size: 1 42 | seed: 12345 43 | grad_clip: 1.0 44 | use_flash_attention: true 45 | activation_checkpointing: true 46 | multi_modal: true 47 | delayed_param: false 48 | sharding_strategy: hybrid_shard 49 | forward_prefetch: true 50 | shard_degree: 32 51 | backward_fetch_policy: 
backward_pre 52 | auto_wrap_policy: transformer_auto_wrap_policy 53 | limit_all_gathers: true 54 | use_orig_param: false 55 | max_context_width: 8192 56 | precision: bf16 57 | lr_decay_iters: 47683 58 | log_reduced_training_loss: true 59 | peft: 60 | peft_type: null 61 | optim: 62 | name: adamw 63 | lr: 0.0002 64 | weight_decay: 0.01 65 | betas: 66 | - 0.9 67 | - 0.98 68 | sched: 69 | name: CosineAnnealing 70 | warmup_steps: 500 71 | constant_steps: 0 72 | min_lr: 2.0e-05 73 | data: 74 | train_dir: null 75 | val_dir: null 76 | dataset_type: hf 77 | use_synthetic_data: false 78 | tokenizer_name: null 79 | zipped_data: false 80 | viztracer: 81 | enabled: false 82 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama3.2-11b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/recipe.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3.2-11b 3 | results_dir: /var/folders/6w/nm79zb595ll18wyj6czl6gfm0000gq/T/tmp1nal2g5n/llama3.2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 100 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | create_tensorboard_logger: true 20 | create_checkpoint_callback: true 21 | checkpoint_callback_params: 22 | save_top_k: 0 23 | every_n_train_steps: 10 24 | monitor: step 25 | mode: max 26 | checkpoint_dir: None/checkpoints/ 27 | resume_from_checkpoint: null 28 | auto_checkpoint: 29 | enabled: false 30 | export_full_model: 31 | every_n_train_steps: 0 32 | save_last: false 33 | use_smp_model: false 34 | distributed_backend: nccl 35 | model: 36 | model_type: llama_v3 37 | do_finetune: false 38 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 39 | hf_access_token: null 40 | train_batch_size: 1 41 | seed: 12345 42 | grad_clip: 1.0 43 | use_flash_attention: true 44 | activation_checkpointing: true 45 | multi_modal: true 46 | delayed_param: false 47 | sharding_strategy: hybrid_shard 48 | forward_prefetch: true 49 | shard_degree: 32 50 | backward_fetch_policy: backward_pre 51 | auto_wrap_policy: transformer_auto_wrap_policy 52 | limit_all_gathers: false 53 | use_orig_param: false 54 | max_context_width: 8192 55 | precision: bf16 56 | lr_decay_iters: 47683 57 | log_reduced_training_loss: true 58 | peft: 59 | peft_type: null 60 | optim: 61 | name: adamw 62 | lr: 0.0002 63 | weight_decay: 0.01 64 | betas: 65 | - 0.9 66 | - 0.98 67 | sched: 68 | name: CosineAnnealing 69 | warmup_steps: 500 70 | constant_steps: 0 71 | min_lr: 2.0e-05 72 | data: 73 | train_dir: null 74 | val_dir: null 75 | dataset_type: hf 76 | use_synthetic_data: false 77 | tokenizer_name: null 78 | zipped_data: false 79 | viztracer: 80 | enabled: false 81 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/requirements.txt: 
-------------------------------------------------------------------------------- 1 | transformers==4.45.2 -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: false 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 1 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 2 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 64 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 16384 67 | max_position_embeddings: 16384 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: 
bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama-8b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: false 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 1 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 2 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 64 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | 
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  max_context_width: 16384
  max_position_embeddings: 16384
  num_hidden_layers: 32
  hidden_size: 4096
  num_attention_heads: 32
  intermediate_size: 14336
  initializer_range: 0.02
  layernorm_epsilon: 1.0e-05
  vocab_size: 128256
  num_key_value_heads: 8
  use_flash_attention: true
  rope_theta: 500000.0
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    high_freq_factor: 4.0
    low_freq_factor: 1.0
    original_max_position_embeddings: 8192
  do_finetune: false
  hf_model_name_or_path: null
  peft:
    peft_type: null
  precision: bf16
  lr_decay_iters: 50
  optim:
    name: adamw
    lr: 0.0001
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 0
      min_lr: 1.0e-06
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: false
  viztracer:
    enabled: false
--------------------------------------------------------------------------------
/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_submission.sh:
--------------------------------------------------------------------------------
#!/bin/bash

pushd $(dirname -- $0)
python launch.py --job_name llama-8b --instance_type p5.48xlarge
popd
--------------------------------------------------------------------------------
/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/sm_jobs_config.yaml:
--------------------------------------------------------------------------------
output_path: s3://test_path
tensorboard_config:
  output_path: null
  container_logs_path: null
wait: true
inputs:
  s3:
    train: s3://test_path
    val: s3://test_path
  file_system:
    id: null
    type: null
    directory_path: null
additional_estimator_kwargs:
  max_run: 1800
  enable_remote_debug: true
recipe_overrides: null
--------------------------------------------------------------------------------
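The three sm_jobs_config.yaml baselines above differ only in their tensorboard_config and inputs.s3 sections, and appear to serve as expected outputs for the sm_jobs workflow tests. As a rough illustration of how a generated config could be diffed against such a baseline with OmegaConf (the helper name and Path arguments below are hypothetical, not the repository's actual test code):

# Illustrative sketch: diff a generated sm_jobs_config.yaml against a baseline artifact.
# assert_matches_baseline and its arguments are assumptions made for this example.
from pathlib import Path

from omegaconf import OmegaConf


def assert_matches_baseline(generated: Path, baseline: Path) -> None:
    generated_dict = OmegaConf.to_container(OmegaConf.load(generated), resolve=True)
    baseline_dict = OmegaConf.to_container(OmegaConf.load(baseline), resolve=True)
    # Comparing plain dicts surfaces nested differences (e.g. inputs.s3.train) in pytest's diff output.
    assert generated_dict == baseline_dict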
/tests/test_config_files.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from typing import Optional

from omegaconf import OmegaConf

from launcher.nemo.constants import ROOT_DIR

from .test_utils import (
    is_job_run_name_valid_for_clusters,
    make_hydra_cfg_instance,
    validate_distributed_degrees,
)

logger = logging.getLogger(__name__)


def test_configuration_files():
    recipes_dir = ROOT_DIR / "recipes_collection/recipes"
    log_config_name = lambda name: logger.info(f"\nFailing Config File: {name}")

    for path in recipes_dir.rglob("*.yaml"):
        if not path.is_file():
            continue

        # Hydra requires relative path definition
        file_path: str = "../" + str(path.relative_to(ROOT_DIR).parent)
        config = make_hydra_cfg_instance(file_path, path.name)

        # plucking values outside the method arguments substantially reduces log output on failure
        shard_degree = OmegaConf.select(config, "model.shard_degree")
        tensor_model_parallel_degree = OmegaConf.select(config, "model.tensor_model_parallel_degree")
        expert_model_parallel_degree = OmegaConf.select(config, "model.expert_model_parallel_degree")
        context_parallel_degree = OmegaConf.select(config, "model.context_parallel_degree")
        num_nodes = OmegaConf.select(config, "trainer.num_nodes")

        assert validate_distributed_degrees(
            shard_degree, tensor_model_parallel_degree, expert_model_parallel_degree, context_parallel_degree, num_nodes
        ), log_config_name(path.name)

        job_run_name: Optional[str] = config.get("run", {}).get("name")
        assert is_job_run_name_valid_for_clusters(job_run_name), log_config_name(path.name)
--------------------------------------------------------------------------------
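test_configuration_files defers the parallelism check to validate_distributed_degrees in tests/test_utils.py, which is not included in this excerpt. A minimal sketch of the kind of rule such a check typically enforces follows; the function name, the 8-GPUs-per-node assumption, and the exact divisibility condition are illustrative rather than the repository's implementation.

# Illustrative sketch only; the real helper lives in tests/test_utils.py (not shown here).
from typing import Optional


def validate_distributed_degrees_sketch(
    shard_degree: Optional[int],
    tensor_model_parallel_degree: Optional[int],
    expert_model_parallel_degree: Optional[int],
    context_parallel_degree: Optional[int],
    num_nodes: Optional[int],
    gpus_per_node: int = 8,  # assumption: 8 accelerators per node (e.g. ml.p5.48xlarge)
) -> bool:
    world_size = (num_nodes or 1) * gpus_per_node
    product = 1
    for degree in (
        shard_degree,
        tensor_model_parallel_degree,
        expert_model_parallel_degree,
        context_parallel_degree,
    ):
        product *= degree or 1  # missing degrees default to 1
    # The combined parallel degrees must tile the world size exactly.
    return product <= world_size and world_size % product == 0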
/tests/test_launcher_scripts.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from typing import Optional

from launcher.nemo.constants import ROOT_DIR

from .test_utils import (
    get_launcher_run_script_paths,
    is_job_run_name_valid_for_clusters,
)

logger = logging.getLogger(__name__)

RUN_SCRIPT_PATHS = get_launcher_run_script_paths()


def test_config_for_run_script_exists():
    RECIPES_DIR = ROOT_DIR / "recipes_collection/recipes"
    log_line = lambda script, config: logger.info(
        f"\nlauncher file: {script.relative_to(ROOT_DIR)}" f"\nconfig file: {config.relative_to(ROOT_DIR)}" "\n"
    )

    def extract_value_in_line(line: str) -> str:
        _, value = line.split("=")
        value = value.replace(" \\", "")  # remove shell line continuation marker
        value = value.strip()
        return value

    def assert_recipe_config_exists(line: str, config_path_str: str):
        # Example:
        # recipes=training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain
        config_path = RECIPES_DIR / (config_path_str + ".yaml")  # append .yaml
        assert config_path.exists(), log_line(run_script_path, config_path)

    def assert_run_name_is_valid(line: str, config_path_str: Optional[str]):
        """
        Ensure the name is valid for Slurm and Kubernetes clusters
        """
        # Example:
        # recipes.run.name="hf-llama3-70b-lora" \
        run_name = extract_value_in_line(line)
        run_name = run_name.replace('"', "")  # remove quotes
        run_name = run_name.strip()

        if config_path_str is None:
            config_path_str = "config_file_not_defined"

        config_path = RECIPES_DIR / (config_path_str + ".yaml")  # append .yaml
        assert is_job_run_name_valid_for_clusters(run_name), log_line(run_script_path, config_path)

    for run_script_path in RUN_SCRIPT_PATHS:
        with open(run_script_path, "r") as fd:
            for line in fd:
                config_path_str = None

                if "recipes=" in line:
                    config_path_str = extract_value_in_line(line)
                    assert_recipe_config_exists(line, config_path_str)

                if "recipes.run.name=" in line:
                    assert_run_name_is_valid(line, config_path_str)
--------------------------------------------------------------------------------
/tests/test_readme.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from difflib import SequenceMatcher
from typing import List

from launcher.nemo.constants import ROOT_DIR

logger = logging.getLogger(__name__)


def test_readme_table_links():
    readme_path = ROOT_DIR / "README.md"
    log_line = lambda line: logger.info(f"\nFailing line:\n{line}")

    def pluck_path_strings(line: str):
        paths_str: List[str] = []

        for chunk in line.split("|"):  # split by column delimiter
            if "[link]" in chunk:
                chunk = chunk.strip()
                chunk = chunk.replace("[link]", "")
                assert chunk[0] == "(" and chunk[-1] == ")", log_line(line)
                chunk = chunk[1:-1]  # remove parentheses
                paths_str.append(chunk)

        return paths_str

    with open(readme_path, "r") as fd:
        for line in fd:
            """
            Example:
            | Hugging Face | Llama 3.2 | 11b | 8192 | 4 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh) |
            """
            if "[link]" in line:
                paths_str = pluck_path_strings(line)

                if len(paths_str) == 1:
                    file_path = ROOT_DIR / paths_str[0]
                    assert file_path.exists(), log_line(line)
                # there is a config and a script link
                elif len(paths_str) == 2:
                    config_file_path = ROOT_DIR / paths_str[0]
                    launcher_script_path = ROOT_DIR / paths_str[1]
                    # try to catch if a launch script is pointing to an incorrect config
                    str_distance_ratio = SequenceMatcher(None, config_file_path.stem, launcher_script_path.stem).ratio()

                    assert config_file_path.exists(), log_line(line)
                    assert launcher_script_path.exists(), log_line(line)
                    assert str_distance_ratio >= 0.8, log_line(line)
                else:
                    raise Exception("test condition not covered")
--------------------------------------------------------------------------------
/validations_wrapper.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from functools import wraps
from typing import Any, Callable, TypeVar, cast

from omegaconf import DictConfig

from launcher.config_validator.type_validator import TypeValidator
from launcher.config_validator.value_validator import ValueValidator

_T = TypeVar("_T", bound=Callable[..., Any])


def validate_config(fn: _T) -> _T:
    @wraps(fn)
    def validations_wrapper(config: DictConfig, *args, **kwargs) -> DictConfig:
        """
        Execute all validations in this function
        """
        type_validator = TypeValidator(config)
        type_validator.validate()
        schema_validator = ValueValidator(config)
        schema_validator.validate()

        return fn(config, *args, **kwargs)

    return cast(_T, validations_wrapper)
--------------------------------------------------------------------------------
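validate_config is intended to wrap the launcher's Hydra entry point so that TypeValidator and ValueValidator run before any launcher logic touches the config. A minimal usage sketch follows, assuming an entry point shaped like main.py; the config_path/config_name values and the body of main are illustrative, not the repository's actual code.

# Illustrative usage of the validate_config decorator on a Hydra entry point.
# config_path/config_name mirror recipes_collection/config.yaml but are assumptions here.
import hydra
from omegaconf import DictConfig, OmegaConf

from validations_wrapper import validate_config


@hydra.main(config_path="recipes_collection", config_name="config", version_base=None)
@validate_config
def main(config: DictConfig) -> None:
    # Both validators have already run; an invalid config raises before this point.
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()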