├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation_request.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ ├── pre-commit-check-runner-push.yml │ ├── repo-monitoring-cron.yml │ ├── security-monitoring-cron.yml │ └── unit-test-runner-push.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Config ├── LICENSE ├── NOTICE ├── README.md ├── THIRD-PARTY.txt ├── launcher ├── __init__.py ├── accelerator_devices.py ├── config_validator │ ├── type_validator.py │ └── value_validator.py ├── efa.py ├── nemo │ ├── README.md │ ├── __init__.py │ ├── constants.py │ ├── k8s_templates │ │ └── training │ │ │ ├── Chart.yaml │ │ │ ├── train-script-gpu.yaml │ │ │ ├── train-script-trn.yaml │ │ │ ├── training-config.yaml │ │ │ ├── training.yaml │ │ │ └── values.yaml │ ├── launchers.py │ ├── recipe_stages.py │ ├── slurm_launcher.py │ └── stages.py └── telemetry.py ├── launcher_scripts ├── custom_model │ └── run_falcon.sh ├── custom_script │ ├── README.md │ ├── config_k8s.yaml │ ├── config_slurm.yaml │ ├── custom_allreduce.py │ └── run_allreduce.sh ├── deepseek │ ├── run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh │ ├── run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_14b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_32b_seq8k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq16k_gpu_lora.sh │ ├── run_hf_deepseek_r1_qwen_7b_seq8k_gpu_fine_tuning.sh │ └── run_hf_deepseek_r1_qwen_7b_seq8k_gpu_lora.sh ├── llama │ ├── p4_run_hf_llama3_70b_seq8k.sh │ ├── p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh │ ├── p4_run_hf_llama3_70b_seq8k_gpu_lora.sh │ ├── p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh │ ├── p4_run_hf_llama3_8b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh │ ├── run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh │ ├── run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh │ ├── run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_3_70b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_3_70b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq128k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq16k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq32k_gpu_qlora.sh │ ├── run_hf_llama3_405b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_405b_seq8k_gpu_qlora.sh │ ├── 
run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_70b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_70b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh │ ├── run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh │ ├── run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh │ ├── run_hf_llama3_8b_seq16k_gpu_lora.sh │ ├── run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_gpu_dpo.sh │ ├── run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh │ ├── run_hf_llama3_8b_seq8k_gpu_lora.sh │ ├── run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh │ ├── run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh │ ├── run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh │ ├── run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh │ ├── run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh │ └── run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh ├── mistral │ ├── run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh │ └── run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh └── mixtral │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh │ ├── run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh │ └── run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh ├── main.py ├── pyproject.toml ├── recipes_collection ├── cluster │ ├── k8s.yaml │ ├── slurm.yaml │ └── sm_jobs.yaml ├── config.yaml └── recipes │ ├── fine-tuning │ ├── deepseek │ │ ├── hf_deepseek_r1_671b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_671b_seq8k_gpu_qlora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_fine_tuning.yaml │ │ ├── 
hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_lora.yaml │ │ ├── hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_fine_tuning.yaml │ │ └── hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_lora.yaml │ └── llama │ │ ├── hf_llama3_3_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_3_70b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_3_70b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq128k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq16k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq32k_gpu_qlora.yaml │ │ ├── hf_llama3_405b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_405b_seq8k_gpu_qlora.yaml │ │ ├── hf_llama3_70b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_70b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_70b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq16k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_8b_seq16k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_dpo.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_fine_tuning.yaml │ │ ├── hf_llama3_8b_seq8k_gpu_lora.yaml │ │ ├── hf_llama3_8b_seq8k_trn1_fine_tuning.yaml │ │ ├── hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.yaml │ │ ├── hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.yaml │ │ ├── hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml │ │ ├── hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml │ │ ├── p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml │ │ ├── p4_hf_llama3_70b_seq8k_gpu_lora.yaml │ │ ├── p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml │ │ └── p4_hf_llama3_8b_seq8k_gpu_lora.yaml │ └── training │ ├── custom_model │ └── falcon.yaml │ ├── llama │ ├── hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml │ ├── hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.yaml │ ├── hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml │ ├── hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x128_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x128_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml │ ├── hf_llama3_70b_seq8k_trn1x16_pretrain.yaml │ ├── hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml │ ├── hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_llama3_8b_seq8k_trn1x4_pretrain.yaml │ ├── megatron_llama3_1_8b_nemo.yaml │ └── p4_hf_llama3_70b_seq8k_gpu.yaml │ ├── mistral │ ├── hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml │ └── hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml │ └── mixtral │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.yaml │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml │ ├── hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml │ ├── hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml │ ├── hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml │ └── hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml ├── requirements.txt ├── scripts └── 
licenseChecker.sh ├── template └── sm_jobs.py ├── tests ├── __init__.py ├── config_validator │ ├── test_type_validator.py │ └── test_value_validator.py ├── k8s_workflow │ ├── k8s_baseline_artifacts │ │ ├── llama-8b │ │ │ ├── k8s_template │ │ │ │ ├── Chart.yaml │ │ │ │ ├── config │ │ │ │ │ └── llama-8b_hydra.yaml │ │ │ │ ├── templates │ │ │ │ │ ├── training-config.yaml │ │ │ │ │ └── training.yaml │ │ │ │ └── values.yaml │ │ │ ├── llama-8b_hydra.yaml │ │ │ └── llama-8b_submission.sh │ │ └── test_custom │ │ │ ├── k8s_template │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ │ └── training.yaml │ │ │ └── values.yaml │ │ │ └── test_custom_submission.sh │ ├── test_custom_k8s_workflow.py │ └── test_recipe_k8s_workflow.py ├── slurm_workflow │ ├── slurm_baseline_artifacts │ │ ├── hf-llama3-8b │ │ │ ├── launch_docker_container.sh │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── sagemaker-hf-llama3-8b_submission.sh │ │ │ └── train_script.sh │ │ ├── llama-8b │ │ │ ├── docker_exec_script.sh │ │ │ ├── launch_docker_container.sh │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── sagemaker-llama-8b_submission.sh │ │ │ └── train_script.sh │ │ └── test_custom │ │ │ ├── docker_exec_script.sh │ │ │ ├── launch_docker_container.sh │ │ │ ├── testcustom_slurm_test_custom_submission.sh │ │ │ └── train_script.sh │ ├── test_custom_slurm_workflow.py │ └── test_recipe_slurm_workflow.py ├── sm_jobs_workflow │ ├── __init__.py │ ├── sm_jobs_baseline_artifacts │ │ ├── multimodal │ │ │ ├── llama3-2-11b │ │ │ │ ├── launch.py │ │ │ │ ├── llama3-2-11b_hydra.yaml │ │ │ │ ├── llama3-2-11b_submission.sh │ │ │ │ ├── recipe.yaml │ │ │ │ ├── requirements.txt │ │ │ │ └── sm_jobs_config.yaml │ │ │ └── llama3.2-11b │ │ │ │ ├── launch.py │ │ │ │ ├── llama3.2-11b_hydra.yaml │ │ │ │ ├── llama3.2-11b_submission.sh │ │ │ │ ├── recipe.yaml │ │ │ │ ├── requirements.txt │ │ │ │ └── sm_jobs_config.yaml │ │ ├── no_kwargs │ │ │ └── llama-8b │ │ │ │ ├── launch.py │ │ │ │ ├── llama-8b_hydra.yaml │ │ │ │ ├── llama-8b_submission.sh │ │ │ │ └── sm_jobs_config.yaml │ │ └── with_kwargs │ │ │ └── llama-8b │ │ │ ├── launch.py │ │ │ ├── llama-8b_hydra.yaml │ │ │ ├── llama-8b_submission.sh │ │ │ └── sm_jobs_config.yaml │ └── test_sm_jobs_workflow.py ├── test_config_files.py ├── test_launcher_scripts.py ├── test_readme.py ├── test_recipes.py └── test_utils.py └── validations_wrapper.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | # Exclude submodule directory from coverage 3 | omit = 4 | launcher/nemo/nemo_framework_launcher/* 5 | template/* 6 | 7 | [report] 8 | fail_under = 85 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: File a report to help us reproduce and fix the problem 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## How to Reproduce? 14 | A clear, step-by-step set of instructions to reproduce the bug. 15 | The provided code need to be **complete** and **runnable**, if additional data is needed, please include them in the issue. 16 | 17 | ## Expected behavior 18 | A clear and concise description of what you expected to happen. 19 | 20 | ## Screenshots, error messages or logs 21 | If applicable, please share with us screenshots, error messages or logs to help explain your problem. 
22 | 23 | ## System information 24 | A description of your system. Please provide: 25 | - **Docker image you ran against**: 26 | - **Source code version you ran against**: 27 | - **Python version**: 28 | - **Hardware accelerator used**: 29 | 30 | ## Additional context 31 | Add any other context about the problem here. Please provide any additional steps you have tried to solve your issue here. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation request 3 | about: Request improved documentation 4 | title: '' 5 | labels: 'documentation request' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## What did you find confusing? 11 | A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...] 12 | 13 | ## Describe how documentation can be improved 14 | A clear and concise description of where documentation was lacking and how it can be improved. 15 | 16 | ## Additional context 17 | Add any other context or screenshots about the documentation request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest new functionality for this project 4 | title: '' 5 | labels: 'feature request' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the feature you'd like 11 | A clear and concise description of the functionality you want. 12 | 13 | ## How would this feature be used? 14 | A clear and concise description of the use case for this feature. Please provide an example, if possible. 15 | 16 | ## Describe alternatives you've considered 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | ## Additional context 20 | Add any other context about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | ### Motivation 4 | Explain the motivation 5 | 6 | ### Changes 7 | * List your changes 8 | 9 | ### Testing 10 | Explain how the changes were tested 11 | 12 | ## Merge Checklist 13 | Put an x in the boxes that apply. If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request. 14 | 15 | ### General 16 | - [ ] I have read the [CONTRIBUTING](../CONTRIBUTING.md) doc 17 | - [ ] I have run `pre-commit run --all-files` on my code. It will check for [this configuration](../.pre-commit-config.yaml). 18 | - [ ] I have updated any necessary documentation, including [READMEs](../README.md) and API docs (if appropriate) 19 | - [ ] I have verified the licenses used in the license-files artifact generated in the Python License Scan CI check. If the license workflow fails, kindly check the licenses used in the artifact. 20 | 21 | ### Tests 22 | - [ ] I have run `pytest` on my code and all unit tests passed. 
23 | - [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) 24 | 25 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 26 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit-check-runner-push.yml: -------------------------------------------------------------------------------- 1 | name: Python Pre Commit Check CI After Commit 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Triggers on direct pushes to the main branch 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.8' # Set python version to 3.8 20 | 21 | - name: Install pre-commit dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install pre-commit 25 | 26 | - name: Run pre-commit checks 27 | run: | 28 | pre-commit run --all-files 29 | -------------------------------------------------------------------------------- /.github/workflows/repo-monitoring-cron.yml: -------------------------------------------------------------------------------- 1 | name: Repository Monitoring 2 | 3 | on: 4 | schedule: 5 | - cron: '0 16 * * *' 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.run_id }} 9 | cancel-in-progress: true 10 | 11 | permissions: 12 | id-token: write # This is required for requesting the JWT 13 | contents: read # This is required for actions/checkout 14 | 15 | jobs: 16 | check-pr-alerts: 17 | runs-on: ubuntu-latest 18 | if: github.event.repository.visibility == 'public' 19 | timeout-minutes: 10 20 | outputs: 21 | pr_count: ${{ steps.pr-count.outputs.count }} 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v3 25 | - name: Check for open PRs 26 | id: pr-count 27 | env: 28 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 29 | run: | 30 | pr_count=$(gh pr list --state open --limit 1000 | wc -l) 31 | echo "count=$pr_count" >> $GITHUB_OUTPUT 32 | 33 | check-issue-alerts: 34 | runs-on: ubuntu-latest 35 | if: github.event.repository.visibility == 'public' 36 | timeout-minutes: 10 37 | outputs: 38 | issue_count: ${{ steps.issue-count.outputs.count }} 39 | steps: 40 | - name: Checkout code 41 | uses: actions/checkout@v3 42 | - name: Check for open issues 43 | id: issue-count 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 46 | run: | 47 | issue_count=$(gh issue list --state open --limit 1000 | wc -l) 48 | echo "count=$issue_count" >> $GITHUB_OUTPUT 49 | 50 | put-metric-data: 51 | runs-on: ubuntu-latest 52 | if: github.event.repository.visibility == 'public' 53 | timeout-minutes: 10 54 | needs: [check-pr-alerts, check-issue-alerts] 55 | steps: 56 | - name: Configure AWS Credentials 57 | uses: aws-actions/configure-aws-credentials@v2 58 | with: 59 | role-to-assume: ${{ secrets.RUNNER_ROLE_ARN }} 60 | role-session-name: repo-monitoring-cron-session 61 | aws-region: us-west-2 62 | 63 | - name: Put PR Alert Metric Data 64 | run: | 65 | aws cloudwatch put-metric-data --metric-name PRAlert --namespace RepoMetrics --value ${{ needs.check-pr-alerts.outputs.pr_count }} --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes 66 | 67 | - name: Put Issue Alert Metric Data 68 | run: | 69 | aws cloudwatch put-metric-data --metric-name IssueAlert --namespace RepoMetrics --value ${{ needs.check-issue-alerts.outputs.issue_count }} --unit Count --dimensions 
ProjectName=sagemaker-hyperpod-recipes 70 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-runner-push.yml: -------------------------------------------------------------------------------- 1 | name: Python Unit Test CI After Commit 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Triggers on direct pushes to the main branch 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | with: 16 | submodules: recursive # Checkout submodules as well 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.8' # Set python version to 3.8 22 | 23 | - name: Install unit test dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r launcher/nemo/nemo_framework_launcher/requirements.txt 27 | pip install pytest 28 | pip install pytest-cov 29 | 30 | - name: Run unit tests 31 | run: | 32 | python -m pytest 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # log and data files 2 | trace 3 | .DS_Store 4 | .hydra 5 | .bash_history.local 6 | results/ 7 | outputs/ 8 | tmp/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | **.pyc 15 | core.* 16 | 17 | # Unit test / coverage reports 18 | coverage_html_report/ 19 | .coverage 20 | .coverage.* 21 | .cache 22 | *.cover 23 | .hypothesis/ 24 | .pytest_cache/ 25 | 26 | # Playground area 27 | mypg/ 28 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "launcher/nemo/nemo_framework_launcher"] 2 | path = launcher/nemo/nemo_framework_launcher 3 | url = https://github.com/NVIDIA/NeMo-Framework-Launcher.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | # force all unspecified python hooks to run python3 3 | python: python3 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v2.3.0 7 | hooks: 8 | - id: end-of-file-fixer 9 | exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) 10 | - id: trailing-whitespace 11 | exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) 12 | - repo: https://github.com/humitos/mirrors-autoflake.git 13 | rev: v1.3 14 | hooks: 15 | - id: autoflake 16 | args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports'] 17 | additional_dependencies: [setuptools] 18 | - repo: https://github.com/psf/black 19 | rev: 23.3.0 20 | hooks: 21 | - id: black 22 | args: [--line-length=120] 23 | - repo: https://github.com/pocc/pre-commit-hooks 24 | rev: v1.1.1 25 | hooks: 26 | - id: clang-format 27 | args: [--style=file, -i] 28 | - repo: https://github.com/pycqa/isort 29 | rev: 5.12.0 30 | hooks: # imports sorting 31 | - id: isort 32 | name: isort (python) 33 | args: ["--profile", "black"] 34 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /Config: -------------------------------------------------------------------------------- 1 | package.SagemakerTrainingLauncher = { 2 | interfaces = (1.0); 3 | 4 | # Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild 5 | build-system = no-op; 6 | build-tools = { 7 | 1.0 = { 8 | NoOpBuild = 1.0; 9 | }; 10 | }; 11 | 12 | # Use runtime-dependencies for when you want to bring in additional 13 | # packages when deploying. 14 | # Use dependencies instead if you intend for these dependencies to 15 | # be exported to other packages that build against you. 16 | dependencies = { 17 | 1.0 = { 18 | }; 19 | }; 20 | 21 | runtime-dependencies = { 22 | 1.0 = { 23 | }; 24 | }; 25 | 26 | }; 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /launcher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /launcher/nemo/README.md: -------------------------------------------------------------------------------- 1 | # Core NeMo launching implementations 2 | This folder contains the core launching framework for NeMo-based implementations. We use the same design as the [NeMo-Framework-Launcher](https://github.com/NVIDIA/NeMo-Framework-Launcher/tree/main). Basically there are two steps (see the sketch after this list): 3 | - A stage defined in `stages.py` prepares the training script launch command and the cluster configs, then passes these configs into the actual launcher 4 | - A launcher defined in `launchers.py` takes the configs from the stage and generates the real launching script. The launcher then kicks off the run using the corresponding cluster method, i.e. Slurm or k8s.
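For illustration, here is a minimal sketch of this two-step flow. It is purely hypothetical: the class and method names below are invented for the example and do not match the actual `stages.py`/`launchers.py` API.

```python
# Illustrative sketch only -- the real stages and launchers have different names and signatures.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class StageOutput:
    """What a stage hands to a launcher: the training command plus the cluster configs."""

    command: List[str]
    cluster_config: Dict[str, str] = field(default_factory=dict)


class ExampleStage:
    """Step 1: prepare the launching command and the cluster configs."""

    def __init__(self, cfg: Dict[str, str]):
        self.cfg = cfg

    def prepare(self) -> StageOutput:
        command = ["torchrun", self.cfg["entry_script"], "--config", self.cfg["config_path"]]
        cluster_config = {"nodes": self.cfg.get("nodes", "1")}
        return StageOutput(command=command, cluster_config=cluster_config)


class ExampleSlurmLauncher:
    """Step 2: turn the stage output into a real launching script and kick off the run."""

    def render(self, out: StageOutput) -> str:
        # In the real launcher this script is written to disk and submitted, e.g. via sbatch.
        return "#!/bin/bash\n#SBATCH --nodes={}\n{}\n".format(out.cluster_config["nodes"], " ".join(out.command))


if __name__ == "__main__":
    stage = ExampleStage({"entry_script": "train.py", "config_path": "config.yaml", "nodes": "2"})
    print(ExampleSlurmLauncher().render(stage.prepare()))
```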
5 | 6 | ## Stages 7 | We support different use cases, each corresponding to a stage: 8 | - `SMTraining`: Stage to run a native NeMo workload 9 | - `SMTrainingGPURecipe`: Stage to run our GPU recipes 10 | - `SMTrainingTrainiumRecipe`: Stage to run our Trainium recipes 11 | - `SMCustomTrainingGPU`: Stage for training with a custom script on GPU 12 | - `SMCustomTrainingTrainium`: Stage for training with a custom script on Trainium 13 | 14 | ## Launchers 15 | Currently we only need our own launchers for custom jobs, because we need to manage the `torchrun` command ourselves: 16 | - `SMSlurmLauncher`: Launcher for custom jobs using Slurm 17 | -------------------------------------------------------------------------------- /launcher/nemo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /launcher/nemo/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
13 | 14 | from pathlib import Path 15 | 16 | SM_ADAPTER_REPO = "https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git" 17 | NEMO_REPO = "https://github.com/NVIDIA/NeMo.git" 18 | NEMO_REPO_TAG = "v2.0.0rc0" # [TODO] move to v2.0.0 once it is released 19 | 20 | SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH = { 21 | "deepseek": "examples/deepseek/deepseek_pretrain.py", 22 | "llama": "examples/llama/llama_pretrain.py", 23 | "mistral": "examples/mistral/mistral_pretrain.py", 24 | "mixtral": "examples/mixtral/mixtral_pretrain.py", 25 | } 26 | 27 | NEURONX_REPO_URI = "https://github.com/aws-neuron/neuronx-distributed-training.git" 28 | NEURONX_REPO_TAG = "main" 29 | NEURONX_CONF_PATH = "examples/conf" 30 | 31 | # utility directory to more easily navigate to other parts of the package 32 | ROOT_DIR = Path(__file__).resolve().parent.parent.parent # package root 33 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/train-script-gpu.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: train-script-gpu-{{ $config.jobName }} 6 | data: 7 | train-script.sh: | 8 | #!/bin/bash 9 | set -ex 10 | 11 | {{- if $config.git.repo_url_or_path }} 12 | mkdir -p $HOME/tmp 13 | GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME 14 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 15 | git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR 16 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 17 | cd $GIT_CLONE_DIR 18 | rm -rf __pycache__ 19 | 20 | {{- if $config.git.branch }} 21 | git checkout {{ $config.git.branch }} 22 | {{- end }} 23 | 24 | {{- if $config.git.commit }} 25 | git fetch origin {{ $config.git.commit }} 26 | git reset --hard {{ $config.git.commit }} 27 | {{- end }} 28 | {{- if $config.git.update_adapter }} 29 | 30 | pip install . --force-reinstall --no-deps 31 | 32 | {{- end }} 33 | {{- else }} 34 | GIT_CLONE_DIR="" 35 | {{- end }} 36 | 37 | {{- range $config.pre_script }} 38 | {{ . }} 39 | {{- end }} 40 | 41 | {{- if gt (int $config.nodes) 1 }} 42 | export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --rdzv_backend=c10d --rdzv_endpoint={{ $config.jobName }}-worker-0" 43 | {{- else }} 44 | export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}" 45 | {{- end }} 46 | 47 | echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" 48 | torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }} \ 49 | {{- if $config.scriptArgs -}} 50 | {{ $config.scriptArgs }} 51 | {{- end }} 52 | 53 | {{- range $config.post_script }} 54 | {{ . 
}} 55 | {{- end }} 56 | 57 | {{- if $config.git.repo_url_or_path }} 58 | cd $HOME 59 | rm -rf $GIT_CLONE_DIR 60 | {{- end }} 61 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/training-config.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: training-config-{{ $config.jobName }} 6 | data: 7 | config.yaml: |- 8 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 9 | -------------------------------------------------------------------------------- /launcher/nemo/k8s_templates/training/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | # training image 3 | trainingImage: cfg.container 4 | 5 | # image pulling policy 6 | pullPolicy: IfNotPresent 7 | 8 | 9 | trainingConfig: 10 | # current job name 11 | jobName: "nil" 12 | 13 | # namespace to launch job 14 | namespace: "default" 15 | 16 | # script path 17 | scriptPath: null 18 | 19 | # script args 20 | scriptArgs: null 21 | 22 | # specify whether to use custom scripts 23 | customScript: null 24 | 25 | # list of custom annotations apply to jobs 26 | annotations: null 27 | 28 | # list of custom labels apply to jobs and pods 29 | customLabels: null 30 | 31 | # Kueue scheduler priority class name 32 | priority_class_name: null 33 | 34 | # device type, can be "gpu", "trainium" and "nil", "nil" means cpu 35 | device: "nil" 36 | 37 | # number of EFA devices if the instance type support EFA 38 | numEFADevices: 0 39 | 40 | # number of Neuron devices if job is for Trainium 41 | numNeuronDevices: null 42 | 43 | # number of process per node 44 | ntasksPerNode: 0 45 | 46 | # number of nodes to run 47 | nodes: training.trainer.num_nodes 48 | 49 | # restart policy 50 | restartPolicy: Never 51 | 52 | # from NeMo, not used currently 53 | wandbKey: "nil" 54 | 55 | # name of service account associated with the namespace 56 | serviceAccountName: null 57 | 58 | # relevant for Trainium chips, either 0 or 1 59 | compile: 0 60 | 61 | # persistent volume, usually used to mount FSx 62 | persistentVolumeClaims: null 63 | 64 | # temp volume, usually used to mount temp file in the host 65 | volumes: null 66 | 67 | # A github repo if user might want to use script inside 68 | git: 69 | repo_url_or_path: null 70 | branch: null 71 | commit: null 72 | token: null 73 | update_adapter: null 74 | 75 | # Commands to run before training 76 | pre_script: [] 77 | # Commands to run after training 78 | post_script: [] 79 | 80 | # select preferred and required labels for nodes 81 | labelSelector: 82 | required: null # select nodes with required labels 83 | preferred: null # select nodes with priority which has preferred labels 84 | weights: null # list of weights for the preferred labels 85 | 86 | # The clean up policy after the job completes or fails. 
87 | cleanPodPolicy: null 88 | -------------------------------------------------------------------------------- /launcher/telemetry.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import time 5 | from dataclasses import asdict, dataclass, field 6 | from typing import List 7 | 8 | CW_NAME_SPACE = "RecipesTelemetry" 9 | 10 | 11 | @dataclass 12 | class Metric: 13 | Name: str = None 14 | Unit: str = None 15 | 16 | 17 | @dataclass 18 | class MetricDirective: 19 | Namespace: str = "" 20 | Dimensions: List[List[str]] = None 21 | Metrics: List[Metric] = None 22 | 23 | 24 | @dataclass 25 | class Metadata: 26 | CloudWatchMetrics: List[MetricDirective] = field(default_factory=lambda: [MetricDirective]) 27 | Timestamp: int = None 28 | 29 | 30 | @dataclass 31 | class CWTelemetryStart: 32 | account_id: str = "" 33 | training_start_time: int = 0 34 | num_nodes: int = 0 35 | job_name: str = "" 36 | cluster_type: str = "" 37 | instance_type: str = "" 38 | _aws: Metadata = None 39 | job_id: int = 0 40 | recipe: str = "" 41 | container: str = "" 42 | 43 | 44 | class Telemetry: 45 | def __init__(self, log_path="/var/log/aws/clusters/sagemaker-hyperpod-recipes-telemetry.log"): 46 | self.log_path = log_path 47 | 48 | def get_account_id(self): 49 | import boto3 50 | 51 | client = boto3.client("sts") 52 | return client.get_caller_identity()["Account"] 53 | 54 | def publish_cw_log(self, log): 55 | save_log = asdict(log) 56 | with open(self.log_path, "a") as f: 57 | f.write(json.dumps(save_log, separators=(",", ":")) + "\n") 58 | 59 | def start( 60 | self, 61 | cluster_type=None, 62 | instance_type=None, 63 | num_nodes=None, 64 | job_id=None, 65 | container=None, 66 | ): 67 | if not os.path.exists(self.log_path): 68 | return 69 | account_id = self.get_account_id() 70 | cw_telemetry_start = CWTelemetryStart(account_id=account_id) 71 | cw_telemetry_start.training_start_time = int(time.time() * 1000) 72 | cw_telemetry_start.num_nodes = int(num_nodes) 73 | cw_telemetry_start.cluster_type = cluster_type 74 | cw_telemetry_start.instance_type = instance_type 75 | cw_telemetry_start.job_id = job_id 76 | cw_telemetry_start.container = container 77 | 78 | recipe = "" 79 | for arg in sys.argv: 80 | if arg.startswith("recipes="): 81 | recipe = arg.split("=")[1] 82 | cw_telemetry_start.recipe = recipe 83 | 84 | metadata = Metadata( 85 | Timestamp=int(time.time() * 1000), 86 | CloudWatchMetrics=[ 87 | MetricDirective( 88 | Namespace=CW_NAME_SPACE, 89 | Dimensions=[[]], 90 | Metrics=[Metric(Name="num_nodes", Unit="Count")], 91 | ) 92 | ], 93 | ) 94 | cw_telemetry_start._aws = metadata 95 | self.publish_cw_log(cw_telemetry_start) 96 | -------------------------------------------------------------------------------- /launcher_scripts/custom_model/run_falcon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 10 | VAL_DIR=${VAL_DIR} # Location of validation dataset 11 | 12 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/custom_model/falcon \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-falcon" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.trainer.num_nodes=4 \ 21 | recipes.model.train_batch_size=2 \ 22 | recipes.model.data.train_dir=$TRAIN_DIR \ 23 | recipes.model.data.val_dir=$VAL_DIR \ 24 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/README.md: -------------------------------------------------------------------------------- 1 | # Config for running with custom scripts 2 | The custom config allows users to use the launcher to run custom jobs that do not use our recipes. We use the Hydra format for the configs, the same as our recipes. Please refer to `config.yaml` as the template; it also aligns with the `config.yaml` in the recipe folder, with some extra configs for the cluster and the custom script. 3 | ## Config fields 4 | Here are some essential fields that users might want to override for custom training: 5 | - training_cfg: This field contains most of the configs for the training run 6 | - entry_script: Path to the entry script for training/fine-tuning. This path can be one in the container mounts. 7 | - script_args: The args that will be used to run this script 8 | - run: All runtime configs 9 | - name: Current run name 10 | - nodes: Number of nodes to use 11 | - ntasks_per_node: Number of devices to use per node 12 | - results_dir: Directory to store the results. It is recommended to keep it as `${base_results_dir}/${.name}` so everything will be in `base_results_dir` 13 | - cluster: All cluster-based configs 14 | - cluster_type: Type of the cluster; can be slurm (bcm) or k8s 15 | - instance_type: Instance type to use; if null, the cluster's default instance type is used. 16 | - cluster_config: The detailed cluster config, which differs between slurm and k8s. For details, please refer to the recipe's doc about cluster setup. 17 | - namespace: Namespace in which to launch jobs 18 | - custom_labels: k8s labels applied to the job and to each pod running the job; see more details about labels in https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ 19 | - annotations: k8s annotations added to the job; see more details in https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ 20 | - priority_class_name: Kueue scheduler priority class name; see more details in https://kueue.sigs.k8s.io/ 21 | - label_selector: k8s NodeAffinity functionality, to allow node selection based on required labels or priority scheduling based on preferred labels. 22 | - service_account_name: AWS EKS service account name, used to give pods credentials to call AWS services. 23 | - persistent_volume_claims: Specify one or more persistent volume claims to mount into the job pod. 24 | The rest of the configs are similar to the recipe configs; a short illustrative sketch of how these fields are consumed follows.
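As a purely illustrative sketch of how these fields fit together, the hypothetical helper below (it is not part of the launcher code) flattens an `entry_script` plus a `script_args` list of key/value pairs into a command line:

```python
# Hypothetical helper for illustration only; the real rendering is done inside the launcher.
from typing import Dict, List, Union


def build_command(entry_script: str, script_args: List[Dict[str, Union[str, int]]]) -> str:
    """Flatten a script_args list (as in the commented-out example in config_slurm.yaml)."""
    parts = [entry_script]
    for arg in script_args or []:
        for flag, value in arg.items():
            parts.append(f"{flag} {value}")
    return " ".join(parts)


# Mirrors the commented-out script_args block in config_slurm.yaml:
print(build_command("train.py", [{"--some_args": "debug"}, {"--some_other_args": 1}]))
# -> train.py --some_args debug --some_other_args 1
```

The distributed arguments themselves (`nodes`, `ntasks_per_node`) are not part of `script_args`; the launcher folds them into the `torchrun` command it generates, as shown in `launcher/nemo/k8s_templates/training/train-script-gpu.yaml`.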
25 | ## Launch 26 | To launch the job, run `python main.py --config-path examples/custom_script/ --config-name config` from inside the `SagemakerTrainingLauncher/launcher` folder, or use your own config folder. 27 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/config_slurm.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | defaults: 4 | - override hydra/job_logging: stdout 5 | 6 | hydra: 7 | run: 8 | dir: . 9 | output_subdir: null 10 | 11 | git: 12 | repo_url_or_path: null 13 | branch: null 14 | commit: null 15 | token: null 16 | 17 | training_cfg: 18 | 19 | entry_script: null # Path to the entry script for training/fine-tuning. This path should be inside the container or a relative path in the git repo 20 | # script_args: 21 | # - "--some_args" : "debug" 22 | # - "--some_other_args" : 1 23 | run: 24 | name: test_custom # Current run name 25 | nodes: 2 # Number of nodes to use for current training 26 | ntasks_per_node: 8 # Number of devices to use per node 27 | 28 | cluster: 29 | #Example slurm cluster 30 | 31 | cluster_type: slurm 32 | instance_type: p5.48xlarge 33 | cluster_config: 34 | exclusive: True 35 | job_name_prefix: testcustom_slurm_ 36 | slurm_create_submission_file_only: False # Set to True if you just want to create the submission file 37 | srun_args: 38 | # - "--no-container-mount-home" 39 | 40 | base_results_dir: null # Location to store the results, checkpoints and logs. 41 | container_mounts: # List of additional paths to mount to the container. They will be mounted to the same path. 42 | - null 43 | container: null # container to use 44 | slurm_docker_cfg: # Will only be used with docker on slurm 45 | docker_args: 46 | # - "--runtime=nvidia" # this is required if the docker runtime version is low 47 | post_launch_commands: # commands that will run after launching the docker container using bash 48 | 49 | env_vars: 50 | NCCL_DEBUG: DEBUG # Logging level for NCCL.
Set to "INFO" for debug information 51 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/custom_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | print("init process group") 5 | dist.init_process_group("nccl") 6 | print("rank:", dist.get_rank()) 7 | torch.cuda.set_device(dist.get_rank() % 8) 8 | tensor = torch.randn(4, 4, device="cuda") 9 | print(f"[{dist.get_rank()}] tensor {tensor}") 10 | dist.all_reduce(tensor) 11 | print(f"[{dist.get_rank()}] tensor {tensor} after reduce") 12 | -------------------------------------------------------------------------------- /launcher_scripts/custom_script/run_allreduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /recipes_collection/config.yaml 4 | 5 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 6 | 7 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 8 | VAL_DIR=${VAL_DIR} # Location of talidation dataset 9 | 10 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, ect 11 | 12 | 13 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 14 | --config-path=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/launcher_scripts/custom_script \ 15 | --config-name=config_slurm \ 16 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 17 | training_cfg.entry_script=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/laucher_scripts/custom_script/custom_allreduce.py \ 18 | container_mounts=[${SAGEMAKER_TRAINING_LAUNCHER_DIR}] \ 19 | container=\ 20 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-671b-seq8k-gpu-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=5 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-671b-seq8k-gpu-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.data.train_dir="$TRAIN_DIR" \ 25 | recipes.model.data.val_dir="$VAL_DIR" \ 26 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 27 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 28 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-llama-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_14b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_14b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-14b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=4 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_1_dot_5b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_1_dot_5b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-1-dot-5b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=4 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=4 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-32b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_7b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_7b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-deepseek-r1-distilled-qwen-7b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 16 | recipes=training/llama/p4_hf_llama3_70b_seq8k_gpu \ 17 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 20 | recipes.trainer.num_nodes=32 \ 21 | recipes.model.train_batch_size=1 \ 22 | recipes.model.data.train_dir="$TRAIN_DIR" \ 23 | recipes.model.data.val_dir="$VAL_DIR" \ 24 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=32 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=20 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=4 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 10 | VAL_DIR=${VAL_DIR} # Location of validation dataset 11 | 12 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 16 | recipes=training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain \ 17 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 18 | recipes.run.name="hf-llama3-2-11b" \ 19 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 20 | recipes.model.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.data.val_dir="$VAL_DIR" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-2-1b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-2-3b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 7 | 8 | TRAIN_DIR=${TRAIN_DIR} # Location of training dataset 9 | VAL_DIR=${VAL_DIR} # Location of validation dataset 10 | 11 | EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. 12 | 13 | 14 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 15 | recipes=training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain \ 16 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 17 | recipes.run.name="hf-llama3-2-90b" \ 18 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 19 | recipes.model.data.train_dir="$TRAIN_DIR" \ 20 | recipes.model.data.val_dir="$VAL_DIR" \ 21 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-seq131072-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=6 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-405b-qlora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=16 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=10 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-70b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
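# Usage sketch for this pretraining launcher -- a hedged example only; the export values below are hypothetical placeholders, not paths shipped with this repository. Set the dataset and experiment locations, then run the script from the repository root:
#   export TRAIN_DIR=/fsx/datasets/llama3/train
#   export VAL_DIR=/fsx/datasets/llama3/val
#   export EXP_DIR=/fsx/experiments/hf-llama3-70b
#   bash launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh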
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-70b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE="${COMPILE}" # Set to 1 to compile the model, 0 to load a pre-compiled model 10 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 11 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 12 | 13 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 14 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 15 | instance_type="trn1.32xlarge" \ 16 | recipes=training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain \ 17 | recipes.run.name="hf-llama3-70b" \ 18 | recipes.run.compile="$COMPILE" \ 19 | recipes.trainer.max_steps=50 \ 20 | recipes.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.model_config="$MODEL_CONFIG" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.data.train_dir="$TRAIN_DIR" \ 25 | recipes.model.data.val_dir="$VAL_DIR" \ 26 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 27 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 28 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
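# Usage sketch for this LoRA fine-tuning launcher -- a hedged example only; the model ID, token, and paths below are hypothetical placeholders, not values from this repository:
#   export HF_MODEL_NAME_OR_PATH=meta-llama/Meta-Llama-3-8B
#   export HF_ACCESS_TOKEN=hf_xxxxxxxx   # optional; typically only required for gated models
#   export TRAIN_DIR=/fsx/datasets/sft/train VAL_DIR=/fsx/datasets/sft/val
#   export EXP_DIR=/fsx/experiments/hf-llama3-8b-lora
#   bash launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh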
16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_dpo \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-dpo" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-fine-tuning" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama3-8b-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=2 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-llama3-8b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE="${COMPILE}" 10 | COMPILER_CACHE_PATH="${COMPILER_CACHE_PATH}" 11 | TOKENIZER_TYPE="${TOKENIZER_TYPE}" 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | RESUME_FROM_CHECKPOINT_DIR="${RESUME_FROM_CHECKPOINT_DIR}" 15 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 16 | 17 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 18 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 19 | instance_type="trn1.32xlarge" \ 20 | recipes=fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning \ 21 | recipes.run.name="hf-llama3-8b-sft" \ 22 | recipes.run.compile="$COMPILE" \ 23 | recipes.trainer.max_steps=50 \ 24 | recipes.compiler_cache_url="$COMPILER_CACHE_PATH" \ 25 | recipes.data.tokenizer.type="$TOKENIZER_TYPE" \ 26 | recipes.data.train_dir="$TRAIN_DIR" \ 27 | recipes.data.val_dir="$VAL_DIR" \ 28 | recipes.exp_manager.resume_from_checkpoint="$RESUME_FROM_CHECKPOINT_DIR" \ 29 | recipes.model.model_config="$MODEL_CONFIG" \ 30 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | COMPILE=0 10 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 11 | MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model 12 | 13 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 14 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 15 | instance_type="trn1.32xlarge" \ 16 | recipes=training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain \ 17 | recipes.run.name="hf-llama3-8b" \ 18 | recipes.run.compile="$COMPILE" \ 19 | recipes.trainer.max_steps=50 \ 20 | recipes.data.train_dir="$TRAIN_DIR" \ 21 | recipes.model.model_config="$MODEL_CONFIG" \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=2 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path 10 | HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token 11 | 12 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 13 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 14 | 15 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 16 | 17 | 18 | HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ 19 | recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text \ 20 | base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ 21 | recipes.run.name="hf-llama-4-17b-16e-lora" \ 22 | recipes.exp_manager.exp_dir="$EXP_DIR" \ 23 | recipes.trainer.num_nodes=1 \ 24 | recipes.model.train_batch_size=1 \ 25 | recipes.model.data.train_dir="$TRAIN_DIR" \ 26 | recipes.model.data.val_dir="$VAL_DIR" \ 27 | recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ 28 | recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ 29 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
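# Note: SAGEMAKER_TRAINING_LAUNCHER_DIR falls back to "$(pwd)" (see the parameter expansion above), so these launchers are normally invoked from the repository root. A hedged sketch of overriding it explicitly -- the paths below are hypothetical placeholders, not values from this repository:
#   SAGEMAKER_TRAINING_LAUNCHER_DIR=/opt/sagemaker-hyperpod-recipes \
#   TRAIN_DIR=/fsx/data/train VAL_DIR=/fsx/data/val EXP_DIR=/fsx/exp/mistral-7b \
#   bash launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh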
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mistral-7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
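# The cluster backend for every launcher in this collection is selected in recipes_collection/config.yaml rather than in the script itself. A hedged sketch of switching to Kubernetes -- the two fields below simply mirror the options documented in that file:
#   defaults:
#     - cluster: k8s
#   cluster_type: k8s
# With those values, the same bash invocation of this script is rendered from the k8s training templates instead of being submitted as a Slurm job.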
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x22b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 4 | 5 | #Users should setup their cluster type in /recipes_collection/config.yaml 6 | 7 | SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} 8 | 9 | TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset 10 | VAL_DIR="${VAL_DIR}" # Location of validation dataset 11 | 12 | EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 13 | 14 | 15 | HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ 16 | recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain \ 17 | base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ 18 | recipes.run.name="hf-mixtral-8x7b" \ 19 | recipes.exp_manager.exp_dir=$EXP_DIR \ 20 | recipes.model.data.train_dir=$TRAIN_DIR \ 21 | recipes.model.data.val_dir=$VAL_DIR \ 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = 7.0 3 | # durations=0 will display all tests' execution times, sorted starting from the slowest one. 4 | # -vv will also display tests with duration = 0.00s 5 | addopts = [ 6 | "--cache-clear", 7 | "--quiet", 8 | "--durations=0", 9 | "--cov=launcher/", 10 | # uncomment this line to see a detailed HTML test coverage report instead of the usual summary table output to stdout. 11 | # "--cov-report=html", 12 | "tests/", 13 | ] 14 | testpaths = ["tests"] 15 | norecursedirs = [".eggs", ".pytest_cache", "*.egg-info", ".git", "build"] 16 | -------------------------------------------------------------------------------- /recipes_collection/cluster/k8s.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | pullPolicy: Always # policy used to pull the container image; can be Always, IfNotPresent, or Never 4 | restartPolicy: Never # restart policy 5 | namespace: default # the namespace to submit the job to 6 | # create customized labels for the PyTorchJob and the Pods it deploys. 7 | # Example: 8 | # custom_labels: 9 | # label-key-1: label-value-1 10 | # label-key-2: label-value-2 11 | custom_labels: null 12 | # create customized annotations for the jobs.
13 | # Example: 14 | # annotations: 15 | # annotation-key-1: annotation-value-1 16 | # annotation-key-2: annotation-value-2 17 | annotations: null 18 | # add service account to job pods 19 | # Example: 20 | # serviceAccountName: service_account 21 | service_account_name: null 22 | # priorityClassName for Kueue scheduler to decide jobs priority 23 | priority_class_name: null 24 | 25 | # temp volume, usually used to mount temp directory 26 | # Example: 27 | # volumes: 28 | # - volumeName: data1 29 | # hostPath: "/data" 30 | # mountPath: "/data" 31 | 32 | volumes: null 33 | 34 | # persistent volume, usually used to mount FSx 35 | # Example: 36 | # persistent_volume_claims: 37 | # - claimName: null 38 | # mountPath: null 39 | # - claimName: null 40 | # mountPath: null 41 | 42 | # persistent volumes, usually used to mount FSx 43 | persistent_volume_claims: 44 | - null 45 | # This claim should be created before running. Example: 46 | # - claimName: fsx-claim 47 | # mountPath: data 48 | 49 | # Create k8s NodeAffinity to select nodes to deploy jobs which matches required and preferred labels 50 | # Structure: 51 | # label_selector: 52 | # required: 53 | # preferred: 54 | # weights: 55 | # Example: 56 | # label_selector: 57 | # required: 58 | # example-label-key: 59 | # - expected-label-value-1 60 | # - expected-label-value-2 61 | # preferred: 62 | # preferred-label-key: 63 | # - preferred-label-value-1 64 | # - preferred-label-value-2 65 | # weights: 66 | # - 100 67 | label_selector: null 68 | 69 | # The clean up policy after the job completes or fails. 70 | cleanPodPolicy: null 71 | -------------------------------------------------------------------------------- /recipes_collection/cluster/slurm.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | exclusive: True 4 | mem: 0 5 | job_name_prefix: 'sagemaker-' 6 | slurm_create_submission_file_only: False # Setting to True if just want to create submission file 7 | stderr_to_stdout: True # Setting to False to split the stderr and stdout logs 8 | srun_args: 9 | # - "--no-container-mount-home" 10 | slurm_docker_cfg: 11 | docker_args: 12 | # - "--runtime=nvidia" # this is required if the docker runtime version is low 13 | post_launch_commands: # commands will run after launching the docker container using bash 14 | container_mounts: # List of additional paths to mount to container. They will be mounted to same path. 15 | - null 16 | -------------------------------------------------------------------------------- /recipes_collection/cluster/sm_jobs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | sm_jobs_config: 15 | output_path: null # S3 output path to output artifacts 16 | tensorboard_config: 17 | output_path: null # Output path for tensorboard logs 18 | container_logs_path: null # Path to logs on the container 19 | wait: True # Whether to wait for training job to finish 20 | inputs: # Inputs to call fit with. Set either s3 or file_system, not both. 21 | s3: # Dictionary of channel names and s3 URIs. For GPUs, use channels for train and validation. 22 | train: null 23 | val: null 24 | file_system: # If using file system input, please pass VPC params in additional_estimator_kwargs. 25 | id: null 26 | type: null 27 | directory_path: null 28 | additional_estimator_kwargs: # All other additional args to pass to estimator. Must be int, float or string. 29 | max_run: 1800 30 | enable_remote_debug: True 31 | recipe_overrides: null 32 | -------------------------------------------------------------------------------- /recipes_collection/config.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com 2 | 3 | defaults: 4 | - _self_ 5 | - cluster: slurm # set to `slurm`, `k8s` or `sm_jobs`, depending on the desired cluster 6 | - recipes: training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain # select desired config inside the training directory 7 | - override hydra/job_logging: stdout 8 | 9 | cluster_type: slurm # bcm, bcp, k8s or sm_jobs. If bcm, k8s or sm_jobs, it must match - cluster above. 10 | # If using sm_jobs cluster_type, set sm_jobs_config. See cluster/sm_jobs.yaml for example. 11 | 12 | hydra: 13 | run: 14 | dir: . 15 | output_subdir: null 16 | 17 | debug: False 18 | 19 | instance_type: p5.48xlarge 20 | base_results_dir: null # Location to store the results, checkpoints and logs. 21 | 22 | container: null 23 | 24 | git: 25 | repo_url_or_path: null 26 | branch: null 27 | commit: null 28 | entry_script: null 29 | token: null 30 | update_adapter: false # if true it will re-install the Adapter code but not its dependencies 31 | 32 | env_vars: 33 | NCCL_DEBUG: WARN # Logging level for NCCL. Set to "INFO" for debug information 34 | 35 | # Do not modify below, use the values above instead. 36 | training_config: ${hydra:runtime.choices.recipes} 37 | -------------------------------------------------------------------------------- /recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com 2 | 3 | run: 4 | name: llama3-2-90b 5 | results_dir: ${base_results_dir}/${.name} 6 | time_limit: "6-00:00:00" 7 | model_type: hf # huggingface for our recipes 8 | 9 | trainer: 10 | devices: 8 11 | num_nodes: 32 12 | accelerator: gpu 13 | precision: bf16 14 | max_steps: 50 15 | log_every_n_steps: 1 16 | 17 | val_check_interval: 1 18 | accumulate_grad_batches: 1 19 | gradient_clip_val: 1.0 20 | 21 | 22 | exp_manager: 23 | exp_dir: null 24 | name: experiment 25 | # experiment loggers 26 | create_tensorboard_logger: False 27 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 28 | create_mlflow_logger: False 29 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 30 | create_wandb_logger: False 31 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 32 | create_checkpoint_callback: True 33 | checkpoint_callback_params: 34 | # Set save_top_k = 0 to disable sharded checkpointing 35 | save_top_k: 0 36 | every_n_train_steps: 10 37 | monitor: "step" 38 | mode: "max" 39 | save_last: False 40 | checkpoint_dir: ${..exp_manager.exp_dir}/checkpoints/ 41 | resume_from_checkpoint: null 42 | # Set auto_checkpoint = False to disable auto resilience checkpointing 43 | auto_checkpoint: 44 | enabled: False 45 | export_full_model: 46 | # Set every_n_train_steps = 0 to disable full checkpointing 47 | every_n_train_steps: 0 48 | save_last: False 49 | 50 | 51 | use_smp_model: False #enable SMP 52 | distributed_backend: nccl 53 | 54 | 55 | # Start training from pretrained model 56 | model: 57 | model_type: llama_v3 58 | do_finetune: False 59 | hf_model_name_or_path: "meta-llama/Llama-3.2-90B-Vision-Instruct" 60 | hf_access_token: null 61 | train_batch_size: 1 62 | seed: 12345 63 | grad_clip: 1.0 64 | use_flash_attention: True 65 | activation_checkpointing: True 66 | multi_modal: True 67 | delayed_param: True 68 | 69 | # FSDP Configs 70 | sharding_strategy: hybrid_shard 71 | forward_prefetch: True 72 | shard_degree: 256 73 | backward_fetch_policy: backward_pre 74 | auto_wrap_policy: transformer_auto_wrap_policy 75 | limit_all_gathers: true 76 | use_orig_param: False 77 | 78 | # model architecture 79 | max_context_width: 8192 80 | precision: bf16 81 | lr_decay_iters: 47683 82 | log_reduced_training_loss: True 83 | 84 | # PEFT 85 | peft: 86 | peft_type: null # lora 87 | 88 | # Optimizer 89 | optim: 90 | name: adamw 91 | lr: 2e-4 92 | weight_decay: 0.01 93 | betas: 94 | - 0.9 95 | - 0.98 96 | sched: 97 | name: CosineAnnealing 98 | warmup_steps: 500 99 | constant_steps: 0 100 | min_lr: 2e-5 101 | 102 | # Data 103 | data: 104 | train_dir: null 105 | val_dir: null 106 | dataset_type: hf 107 | use_synthetic_data: False 108 | tokenizer_name: null 109 | zipped_data: False 110 | 111 | # Profiling configs 112 | # Viztracer profiling options 113 | viztracer: 114 | enabled: false 115 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hydra-core==1.3.2 2 | omegaconf>=2.2,<2.3 3 | pynvml==11.4.1 4 | requests==2.26.0 5 | tqdm==4.62.3 6 | zstandard==0.15.2 7 | tensorboard==2.12.0 8 | boto3==1.35.66 9 | -------------------------------------------------------------------------------- /scripts/licenseChecker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"). You 6 | # may not use this file except in compliance with the License. A copy of 7 | # the License is located at 8 | # 9 | # http://aws.amazon.com/apache2.0/ 10 | # 11 | # or in the "license" file accompanying this file. This file is 12 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 13 | # ANY KIND, either express or implied. See the License for the specific 14 | # language governing permissions and limitations under the License. 15 | 16 | check_licenses() { 17 | LICENSE_LIST=$(cat ./ApprovedLicenses.txt | tr '\n' '|'| sed 's/|$//') 18 | pip-licenses --summary > LicenseSummary.txt 19 | awk '{$1=""; print $0}' ./LicenseSummary.txt | tail -n +2 | sed 's/;/\n/g' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//'| sort -u > ./newLicenseSummary.txt 20 | while IFS= read -r line || [[ -n "$line" ]]; do 21 | if ! echo "$LICENSE_LIST" | grep -q "$line"; then 22 | echo "License '$line' is not in the allowed list." 23 | exit 1 24 | fi 25 | done < ./newLicenseSummary.txt 26 | 27 | if ! grep -q "prohibited-license: Did not find content matching specified patterns" ./scanOutput.txt; then 28 | echo "Prohibited License Used in Source Code Scan: " 29 | sed -n '/⚠ prohibited-license:/,/⚠ third-party-license-file:/p' ./scanOutput.txt | sed '1d;$d'| cat 30 | exit 1 31 | fi 32 | echo "License Check complete" 33 | } 34 | 35 | check_licenses 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-hyperpod-recipes/6a633c5500f60cea22d9409e06b069c1184b43e8/tests/__init__.py -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/config/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 10 13 | exp_manager: 14 | exp_dir: /fsx/exp/ 15 | name: my_experiment 16 | # experiment loggers 17 | create_tensorboard_logger: False 18 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 19 | create_mlflow_logger: False 20 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 21 | create_wandb_logger: False 22 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 23 | create_checkpoint_callback: true 24 | checkpoint_callback_params: 25 | save_top_k: 10 26 | use_smp_model: true 27 | distributed_backend: smddp 28 | model: 29 | model_type: llama_v3 30 | train_batch_size: 4 31 | val_batch_size: 1 32 | tensor_model_parallel_degree: 1 33 | expert_model_parallel_degree: 1 34 | moe: false 35 | sequence_parallel: true 36 | activation_checkpointing: true 37 | 
activation_loading_horizon: 2 38 | delayed_param: true 39 | offload_activations: false 40 | use_smp_model_flash_attn: false 41 | seed: 12345 42 | grad_clip: 1.0 43 | hf_pretrained_model: null 44 | sharding_strategy: hybrid_shard 45 | forward_prefetch: true 46 | shard_degree: 16 47 | backward_fetch_policy: backward_pre 48 | auto_wrap_policy: transformer_auto_wrap_policy 49 | limit_all_gathers: true 50 | use_orig_param: false 51 | max_context_width: 2048 52 | max_position_embeddings: 2048 53 | num_hidden_layers: 8 54 | hidden_size: 4096 55 | num_attention_heads: 32 56 | llama_intermediate_size: 14336 57 | initializer_range: 0.02 58 | layernorm_epsilon: 1.0e-05 59 | vocab_size: 32000 60 | num_key_value_heads: 8 61 | transformer_engine: true 62 | fp8: false 63 | fp8_amax_history_len: 1024 64 | fp8_amax_compute_algo: max 65 | do_finetune: false 66 | finetune_with_pretrained_weights: false 67 | pretrained_model_weights: null 68 | precision: bf16 69 | lr_decay_iters: 47683 70 | log_reduced_training_loss: true 71 | optim: 72 | name: adamw 73 | lr: 0.0001 74 | weight_decay: 0.01 75 | betas: 76 | - 0.9 77 | - 0.95 78 | sched: 79 | name: CosineAnnealing 80 | warmup_steps: 0 81 | constant_steps: 0 82 | min_lr: 0.000001 83 | data: 84 | train_dir: // 85 | val_dir: null 86 | dataset_type: gpt 87 | use_synthetic_data: false 88 | zipped_data: true 89 | cluster_type: k8s 90 | launcher_scripts_path: {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts/ 91 | data_config: llama-8b 92 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training-config.yaml: -------------------------------------------------------------------------------- 1 | {{ $config := .Values.trainingConfig }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: training-config-{{ $config.jobName }} 6 | data: 7 | config.yaml: |- 8 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 9 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: test_container 3 | pullPolicy: Always 4 | trainingConfig: 5 | jobName: llama-8b 6 | namespace: default 7 | scriptPath: examples/llama/llama_pretrain.py 8 | scriptArgs: --config-path=/config --config-name=config.yaml 9 | customScript: null 10 | annotations: null 11 | customLabels: null 12 | priority_class_name: null 13 | device: gpu 14 | numEFADevices: 32 15 | numNeuronDevices: null 16 | ntasksPerNode: 8 17 | nodes: 16 18 | restartPolicy: Never 19 | wandbKey: nil 20 | serviceAccountName: null 21 | compile: 0 22 | persistentVolumeClaims: 23 | - null 24 | volumes: null 25 | git: 26 | repo_url_or_path: https://test_token@github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git 27 | branch: test_branch 28 | commit: test_commit 29 | token: null 30 | update_adapter: false 31 | pre_script: [] 32 | post_script: [] 33 | labelSelector: 34 | required: null 35 | preferred: null 36 | weights: null 37 | cleanPodPolicy: null 38 | envVars: 39 | NCCL_DEBUG: WARN 40 | NEMO_LAUNCHER_DEBUG: 1 41 | SLURM_NTASKS_PER_NODE: 8 42 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 43 | FI_PROVIDER: efa 44 | NCCL_SOCKET_IFNAME: ^lo,docker0,veth_def_agent 45 | NCCL_IGNORE_DISABLED_P2P: '1' 46 | TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' 47 | TORCH_DIST_INIT_BARRIER: '1' 48 | 
CUDA_DEVICE_MAX_CONNECTIONS: '1' 49 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 4 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 4 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 2 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 16 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 16384 67 | max_position_embeddings: 16384 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | helm install --timeout=15m --wait --namespace default llama-8b {$results_dir}/llama-8b/k8s_template 3 | 
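Note on the baseline artifacts above: the {$results_dir} and {$workspace_dir} tokens are placeholders that stand in for the concrete paths the launcher writes at generation time, and the rendered chart is applied with the single helm install call in llama-8b_submission.sh. A minimal sketch of regenerating these k8s artifacts, condensed from the recipe k8s workflow test later in this listing (tests/k8s_workflow/test_recipe_k8s_workflow.py); the container name, instance type, and temp directory here are the test's placeholder values, not requirements:

    from main import main
    from tests.test_utils import create_temp_directory, make_hydra_cfg_instance

    # Compose recipes_collection/config.yaml for a k8s run, as the workflow test does.
    artifacts_dir = create_temp_directory()
    overrides = [
        "instance_type=p5.48xlarge",
        "base_results_dir={}".format(artifacts_dir),
        "container=test_container",          # placeholder image URI
        "cluster=k8s",
        "cluster_type=k8s",
        "+env_vars.NEMO_LAUNCHER_DEBUG=1",   # set by the workflow tests below
    ]
    cfg = make_hydra_cfg_instance("../recipes_collection", "config", overrides)

    # Writes <artifacts_dir>/llama-8b/k8s_template/ plus llama-8b_submission.sh,
    # the same layout captured in the baseline artifacts above.
    main(cfg)

Running the generated llama-8b_submission.sh then performs the same helm install shown above against the default namespace.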
-------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: Sagemaker Model Training 4 | name: sagemaker-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: test_container 3 | pullPolicy: Always 4 | trainingConfig: 5 | jobName: test_custom 6 | namespace: default 7 | scriptPath: test.py 8 | scriptArgs: '--some_args debug --some_other_args 1 ' 9 | customScript: true 10 | annotations: null 11 | customLabels: null 12 | priority_class_name: null 13 | device: gpu 14 | numEFADevices: 32 15 | numNeuronDevices: null 16 | ntasksPerNode: 8 17 | nodes: 8 18 | restartPolicy: Never 19 | wandbKey: nil 20 | serviceAccountName: null 21 | compile: 0 22 | persistentVolumeClaims: null 23 | volumes: null 24 | git: 25 | repo_url_or_path: https://github.com/example 26 | branch: null 27 | commit: null 28 | token: null 29 | update_adapter: null 30 | pre_script: [] 31 | post_script: [] 32 | labelSelector: 33 | required: null 34 | preferred: null 35 | weights: null 36 | cleanPodPolicy: null 37 | envVars: 38 | NCCL_DEBUG: DEBUG 39 | NEMO_LAUNCHER_DEBUG: 1 40 | CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 41 | FI_PROVIDER: efa 42 | NCCL_SOCKET_IFNAME: ^lo,docker0,veth_def_agent 43 | NCCL_IGNORE_DISABLED_P2P: '1' 44 | TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' 45 | TORCH_DIST_INIT_BARRIER: '1' 46 | CUDA_DEVICE_MAX_CONNECTIONS: '1' 47 | -------------------------------------------------------------------------------- /tests/k8s_workflow/k8s_baseline_artifacts/test_custom/test_custom_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | helm install --timeout=15m --wait --namespace default test-custom {$results_dir}/test_custom/k8s_template 3 | -------------------------------------------------------------------------------- /tests/k8s_workflow/test_custom_k8s_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from omegaconf import OmegaConf 4 | 5 | from main import main 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | from tests.test_utils import ( 10 | compare_artifacts, 11 | create_temp_directory, 12 | make_hydra_cfg_instance, 13 | ) 14 | 15 | 16 | def compare_custom_k8s_artifacts(artifacts_dir): 17 | logger.info("Comparing custom k8s artifacts") 18 | 19 | artifacts_paths = [ 20 | "/test_custom/test_custom_submission.sh", 21 | "/test_custom/k8s_template/Chart.yaml", 22 | "/test_custom/k8s_template/values.yaml", 23 | "/test_custom/k8s_template/templates/training.yaml", 24 | ] 25 | 26 | k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" 27 | compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) 28 | 29 | 30 | def test_custom_k8s_workflow(): 31 | logger.info("Testing k8s workflow") 32 | 33 | artifacts_dir = create_temp_directory() 34 | overrides = [ 35 | "training_cfg.entry_script=test.py", 36 | "cluster.instance_type=p5.48xlarge", 37 | "base_results_dir={}".format(artifacts_dir), 38 | "container=test_container", 39 | "git.repo_url_or_path=https://github.com/example", 40 
| "+env_vars.NEMO_LAUNCHER_DEBUG=1", 41 | ] 42 | 43 | sample_custom_k8s_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_k8s", overrides) 44 | 45 | logger.info("\nsample_custom_k8s_config\n") 46 | logger.info(OmegaConf.to_yaml(sample_custom_k8s_config)) 47 | 48 | main(sample_custom_k8s_config) 49 | 50 | compare_custom_k8s_artifacts(artifacts_dir) 51 | -------------------------------------------------------------------------------- /tests/k8s_workflow/test_recipe_k8s_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from omegaconf import OmegaConf 4 | 5 | from main import main 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | import pytest 10 | 11 | from tests.test_utils import ( 12 | compare_artifacts, 13 | create_temp_directory, 14 | make_hydra_cfg_instance, 15 | ) 16 | 17 | 18 | def compare_recipe_k8s_artifacts(artifacts_dir): 19 | logger.info("Comparing recipe k8s artifacts") 20 | 21 | artifacts_paths = [ 22 | "/llama-8b/llama-8b_submission.sh", 23 | # "/llama-8b/llama-8b_hydra.yaml", # Do not test recipe, this changes often 24 | "/llama-8b/k8s_template/values.yaml", 25 | "/llama-8b/k8s_template/Chart.yaml", 26 | # "/llama-8b/k8s_template/config/llama-8b_hydra.yaml", # Do not test recipe, this changes often 27 | "/llama-8b/k8s_template/templates/training.yaml", 28 | "/llama-8b/k8s_template/templates/training-config.yaml", 29 | ] 30 | 31 | k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" 32 | compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) 33 | 34 | 35 | def test_recipe_k8s_workflow(): 36 | logger.info("Testing recipe k8s workflow") 37 | 38 | artifacts_dir = create_temp_directory() 39 | overrides = [ 40 | "instance_type=p5.48xlarge", 41 | "base_results_dir={}".format(artifacts_dir), 42 | "container=test_container", 43 | "cluster=k8s", 44 | "cluster_type=k8s", 45 | "+env_vars.NEMO_LAUNCHER_DEBUG=1", 46 | "git.repo_url_or_path=https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git", 47 | "git.branch=test_branch", 48 | "git.commit=test_commit", 49 | "git.token=test_token", 50 | ] 51 | 52 | sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) 53 | 54 | logger.info("\nsample_recipe_k8s_config\n") 55 | logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) 56 | 57 | main(sample_recipe_k8s_config) 58 | 59 | compare_recipe_k8s_artifacts(artifacts_dir) 60 | 61 | 62 | def test_recipe_k8s_workflow_invalid(): 63 | logger.info("Testing recipe k8s workflow with invalid git config") 64 | 65 | artifacts_dir = create_temp_directory() 66 | overrides = [ 67 | "instance_type=p5.48xlarge", 68 | "base_results_dir={}".format(artifacts_dir), 69 | "container=test_container", 70 | "cluster=k8s", 71 | "cluster_type=k8s", 72 | "+env_vars.NEMO_LAUNCHER_DEBUG=1", 73 | "git.repo_url_or_path=/local/path", 74 | ] 75 | 76 | sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) 77 | 78 | logger.info("\nsample_recipe_k8s_config\n") 79 | logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) 80 | 81 | with pytest.raises(ValueError): 82 | main(sample_recipe_k8s_config) 83 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image 
is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 4 | # Login ECR 5 | aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com 6 | 7 | # Getting EFA devices 8 | device=("--device=/dev/gdrdrv") 9 | while IFS= read -r -d '' d; do 10 | device+=("--device=${d}") 11 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 12 | 13 | # Clean old containers 14 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 15 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 16 | 17 | docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 18 | docker run --gpus 32 \ 19 | --privileged --rm -d --name "sm_training_launcher" \ 20 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 21 | --security-opt seccomp=unconfined \ 22 | "${device[@]}" \ 23 | -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 24 | -v {$results_dir}:{$results_dir} \ 25 | test_docker_cmd \ 26 | "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity 27 | 28 | # Running post launching commands 29 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 30 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 31 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 32 | docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" 33 | 34 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 2 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 1 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: true 53 | activation_loading_horizon: 2 54 | delayed_param: true 55 | 
offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 8 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: false 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 8192 67 | max_position_embeddings: 8192 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --exclusive 5 | #SBATCH --job-name=sagemaker-hf-llama3-8b 6 | #SBATCH --mem=0 7 | #SBATCH --nodes=4 8 | #SBATCH --output={$results_dir}/hf-llama3-8b/log-sagemaker-hf-llama3-8b_%j.out 9 | #SBATCH --time=6-00:00:00 10 | 11 | # setup 12 | export NCCL_DEBUG=WARN 13 | export FI_PROVIDER=efa 14 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 15 | export NCCL_IGNORE_DISABLED_P2P=1 16 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 17 | export TORCH_DIST_INIT_BARRIER=1 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Prepare distributed files 22 | srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/hf-llama3-8b/hostname" 23 | 24 | srun -l bash {$results_dir}/hf-llama3-8b/launch_docker_container.sh 25 | srun -l bash {$results_dir}/hf-llama3-8b/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=WARN 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=4 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | 
GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | examples/llama/llama_pretrain.py \ 34 | --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/docker_exec_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | function job_epilogue { 5 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 6 | } 7 | trap job_epilogue EXIT SIGTERM SIGINT 8 | 9 | docker exec sm_training_launcher bash {$results_dir}/llama-8b/train_script.sh 10 | 11 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 4 | # Login ECR 5 | aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com 6 | 7 | # Getting EFA devices 8 | device=("--device=/dev/gdrdrv") 9 | while IFS= read -r -d '' d; do 10 | device+=("--device=${d}") 11 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 12 | 13 | # Clean old containers 14 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 15 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 16 | 17 | docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" 18 | docker run --gpus 8 \ 19 | --privileged --rm -d --name "sm_training_launcher" \ 20 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 21 | --security-opt seccomp=unconfined \ 22 | "${device[@]}" \ 23 | -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 24 | -v {$results_dir}:{$results_dir} \ 25 | test_docker_cmd \ 26 | "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity 27 | 28 | # Running post launching commands 29 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 30 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 31 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 32 | docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" 33 | 34 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 
1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: true 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 2 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 1 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: true 53 | activation_loading_horizon: 2 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 8 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: false 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 8192 67 | max_position_embeddings: 8192 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/sagemaker-llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --exclusive 5 | #SBATCH --job-name=sagemaker-llama-8b 6 | #SBATCH --mem=0 7 | #SBATCH --nodes=16 8 | #SBATCH --output={$results_dir}/llama-8b/log-sagemaker-llama-8b_%j.out 9 | #SBATCH --time=6-00:00:00 10 | 11 | # setup 12 | export NCCL_DEBUG=WARN 13 | export FI_PROVIDER=efa 14 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 15 | export NCCL_IGNORE_DISABLED_P2P=1 16 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 17 | export TORCH_DIST_INIT_BARRIER=1 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Prepare distributed files 22 | srun -l bash -c "scontrol show 
hostnames | sort > {$results_dir}/llama-8b/hostname" 23 | 24 | srun -l bash {$results_dir}/llama-8b/launch_docker_container.sh 25 | srun -l bash {$results_dir}/llama-8b/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=WARN 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=16 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | examples/llama/llama_pretrain.py \ 34 | --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/docker_exec_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | function job_epilogue { 5 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 6 | } 7 | trap job_epilogue EXIT SIGTERM SIGINT 8 | 9 | docker exec sm_training_launcher bash {$results_dir}/test_custom/train_script.sh 10 | 11 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/launch_docker_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | echo "image is test_container" 4 | echo "Not an ECR image, skipping ECR login" 5 | # Getting EFA devices 6 | device=("--device=/dev/gdrdrv") 7 | while IFS= read -r -d '' d; do 8 | device+=("--device=${d}") 9 | done < <(find "/dev/infiniband" -name "uverbs*" -print0) 10 | 11 | # Clean old containers 12 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true 13 | docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true 14 | 15 | docker pull "test_container" 16 | docker run --gpus 8 \ 17 | --privileged --rm -d --name "sm_training_launcher" \ 18 | --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ 19 | --security-opt seccomp=unconfined \ 20 | "${device[@]}" \ 21 | -v 
{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ 22 | -v {$results_dir}:{$results_dir} \ 23 | "test_container" sleep infinity 24 | 25 | # Running post launching commands 26 | docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" 27 | docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" 28 | docker exec -itd "sm_training_launcher" bash -c "service ssh start" 29 | 30 | exit 0 -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/testcustom_slurm_test_custom_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --error={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.err 5 | #SBATCH --exclusive 6 | #SBATCH --job-name=testcustom_slurm_test_custom 7 | #SBATCH --nodes=2 8 | #SBATCH --output={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.out 9 | 10 | # setup 11 | export NCCL_DEBUG=DEBUG 12 | export FI_PROVIDER=efa 13 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 14 | export NCCL_IGNORE_DISABLED_P2P=1 15 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 16 | export TORCH_DIST_INIT_BARRIER=1 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | 19 | 20 | # Prepare distributed files 21 | srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/test_custom/hostname" 22 | 23 | srun -l bash {$results_dir}/test_custom/launch_docker_container.sh 24 | srun -l bash {$results_dir}/test_custom/docker_exec_script.sh -------------------------------------------------------------------------------- /tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | export NCCL_DEBUG=DEBUG 4 | export FI_PROVIDER=efa 5 | export NCCL_SOCKET_IFNAME=^lo,docker0,veth_def_agent 6 | export NCCL_IGNORE_DISABLED_P2P=1 7 | export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 8 | export TORCH_DIST_INIT_BARRIER=1 9 | export CUDA_DEVICE_MAX_CONNECTIONS=1 10 | MASTER_ADDR=$(head -n 1 {$results_dir}/test_custom/hostname) 11 | NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/test_custom/hostname | cut -d ":" -f 1) - 1)) 12 | NNODES=2 13 | PROCESSES_PER_NODE=8 14 | MASTER_PORT=41000 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" 17 | 18 | # For greater env stability, grab hostname from `hostname` 19 | # https://sim.amazon.com/issues/P162624109 20 | LAUNCHER_HOSTNAME="$(hostname)" 21 | 22 | mkdir -p $HOME/tmp 23 | GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" 24 | [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR 25 | git clone https://github.com/example $GIT_CLONE_DIR 26 | GIT_CLONE_DIR=${GIT_CLONE_DIR}/ 27 | cd $GIT_CLONE_DIR 28 | rm -rf __pycache__ 29 | 30 | unset SLURM_NTASKS 31 | 32 | torchrun $DISTRIBUTED_ARGS \ 33 | test.py \ 34 | -------------------------------------------------------------------------------- /tests/slurm_workflow/test_custom_slurm_workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import logging 15 | 16 | from omegaconf import OmegaConf 17 | 18 | from main import main 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | from tests.test_utils import ( 24 | compare_artifacts, 25 | create_temp_directory, 26 | make_hydra_cfg_instance, 27 | ) 28 | 29 | 30 | def compare_custom_slurm_artifacts(artifacts_dir): 31 | logger.info("Comparing custom slurm artifacts") 32 | 33 | artifacts_paths = [ 34 | "/test_custom/launch_docker_container.sh", 35 | "/test_custom/testcustom_slurm_test_custom_submission.sh", 36 | "/test_custom/train_script.sh", 37 | "/test_custom/docker_exec_script.sh", 38 | ] 39 | slurm_baseline_artifacts_path = "/tests/slurm_workflow/slurm_baseline_artifacts" 40 | compare_artifacts(artifacts_paths, artifacts_dir, slurm_baseline_artifacts_path) 41 | 42 | 43 | def test_custom_slurm_workflow(): 44 | logger.info("Testing custom slurm workflow") 45 | 46 | artifacts_dir = create_temp_directory() 47 | overrides = [ 48 | "training_cfg.entry_script=test.py", 49 | "cluster.instance_type=p5.48xlarge", 50 | "cluster.cluster_type=slurm", 51 | "cluster.cluster_config.slurm_create_submission_file_only=True", 52 | "git.repo_url_or_path=https://github.com/example", 53 | "base_results_dir={}".format(artifacts_dir), 54 | "container=test_container", 55 | ] 56 | 57 | sample_custom_slurm_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_slurm", overrides) 58 | 59 | logger.info("\nsample_custom_slurm_config\n") 60 | logger.info(OmegaConf.to_yaml(sample_custom_slurm_config)) 61 | 62 | main(sample_custom_slurm_config) 63 | 64 | compare_custom_slurm_artifacts(artifacts_dir) 65 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-hyperpod-recipes/6a633c5500f60cea22d9409e06b069c1184b43e8/tests/sm_jobs_workflow/__init__.py -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/llama3-2-11b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3-2-11b 3 | results_dir: {$results_dir}/llama3-2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | # experiment loggers 20 | create_tensorboard_logger: False 21 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 22 | create_mlflow_logger: False 23 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 24 | create_wandb_logger: False 25 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 26 | 
create_checkpoint_callback: true 27 | checkpoint_callback_params: 28 | save_top_k: 0 29 | every_n_train_steps: 10 30 | monitor: step 31 | mode: max 32 | save_last: false 33 | checkpoint_dir: None/checkpoints/ 34 | resume_from_checkpoint: null 35 | auto_checkpoint: 36 | enabled: false 37 | export_full_model: 38 | every_n_train_steps: 0 39 | save_last: false 40 | use_smp_model: false 41 | distributed_backend: nccl 42 | model: 43 | model_type: llama_v3 44 | do_finetune: false 45 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 46 | hf_access_token: null 47 | train_batch_size: 1 48 | seed: 12345 49 | grad_clip: 1.0 50 | use_flash_attention: true 51 | activation_checkpointing: true 52 | multi_modal: true 53 | delayed_param: false 54 | sharding_strategy: hybrid_shard 55 | forward_prefetch: true 56 | shard_degree: 32 57 | backward_fetch_policy: backward_pre 58 | auto_wrap_policy: transformer_auto_wrap_policy 59 | limit_all_gathers: true 60 | use_orig_param: false 61 | max_context_width: 8192 62 | precision: bf16 63 | lr_decay_iters: 47683 64 | log_reduced_training_loss: true 65 | peft: 66 | peft_type: null 67 | optim: 68 | name: adamw 69 | lr: 0.0002 70 | weight_decay: 0.01 71 | betas: 72 | - 0.9 73 | - 0.98 74 | sched: 75 | name: CosineAnnealing 76 | warmup_steps: 500 77 | constant_steps: 0 78 | min_lr: 2.0e-05 79 | data: 80 | train_dir: null 81 | val_dir: null 82 | dataset_type: hf 83 | use_synthetic_data: false 84 | tokenizer_name: null 85 | zipped_data: false 86 | viztracer: 87 | enabled: false 88 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/llama3-2-11b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama3-2-11b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/recipe.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3-2-11b 3 | results_dir: /var/folders/6w/nm79zb595ll18wyj6czl6gfm0000gq/T/tmp1nal2g5n/llama3-2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 100 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | # experiment loggers 20 | create_tensorboard_logger: False 21 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 22 | create_mlflow_logger: False 23 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 24 | create_wandb_logger: False 25 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 26 | create_checkpoint_callback: true 27 | checkpoint_callback_params: 28 | save_top_k: 0 29 | every_n_train_steps: 10 30 | monitor: step 31 | mode: max 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: false 39 | use_smp_model: false 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | do_finetune: false 44 | hf_model_name_or_path: 
meta-llama/Llama-3.2-11B-Vision-Instruct 45 | hf_access_token: null 46 | train_batch_size: 1 47 | seed: 12345 48 | grad_clip: 1.0 49 | use_flash_attention: true 50 | activation_checkpointing: true 51 | multi_modal: true 52 | delayed_param: false 53 | sharding_strategy: hybrid_shard 54 | forward_prefetch: true 55 | shard_degree: 32 56 | backward_fetch_policy: backward_pre 57 | auto_wrap_policy: transformer_auto_wrap_policy 58 | limit_all_gathers: false 59 | use_orig_param: false 60 | max_context_width: 8192 61 | precision: bf16 62 | lr_decay_iters: 47683 63 | log_reduced_training_loss: true 64 | peft: 65 | peft_type: null 66 | optim: 67 | name: adamw 68 | lr: 0.0002 69 | weight_decay: 0.01 70 | betas: 71 | - 0.9 72 | - 0.98 73 | sched: 74 | name: CosineAnnealing 75 | warmup_steps: 500 76 | constant_steps: 0 77 | min_lr: 2.0e-05 78 | data: 79 | train_dir: null 80 | val_dir: null 81 | dataset_type: hf 82 | use_synthetic_data: false 83 | tokenizer_name: null 84 | zipped_data: false 85 | viztracer: 86 | enabled: false 87 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.45.2 -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3-2-11b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3.2-11b 3 | results_dir: {$results_dir}/llama3.2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | create_tensorboard_logger: true 20 | create_checkpoint_callback: true 21 | checkpoint_callback_params: 22 | save_top_k: 0 23 | every_n_train_steps: 10 24 | monitor: step 25 | mode: max 26 | save_last: false 27 | checkpoint_dir: None/checkpoints/ 28 | resume_from_checkpoint: null 29 | auto_checkpoint: 30 | enabled: false 31 | export_full_model: 32 | every_n_train_steps: 0 33 | save_last: false 34 | use_smp_model: false 35 | distributed_backend: nccl 36 | model: 37 | model_type: llama_v3 38 | do_finetune: false 39 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 40 | hf_access_token: null 41 | train_batch_size: 1 42 | seed: 12345 43 | grad_clip: 1.0 44 | use_flash_attention: true 45 | activation_checkpointing: true 46 | multi_modal: true 47 | delayed_param: false 48 | sharding_strategy: hybrid_shard 49 | forward_prefetch: true 50 | shard_degree: 32 51 | backward_fetch_policy: 
backward_pre 52 | auto_wrap_policy: transformer_auto_wrap_policy 53 | limit_all_gathers: true 54 | use_orig_param: false 55 | max_context_width: 8192 56 | precision: bf16 57 | lr_decay_iters: 47683 58 | log_reduced_training_loss: true 59 | peft: 60 | peft_type: null 61 | optim: 62 | name: adamw 63 | lr: 0.0002 64 | weight_decay: 0.01 65 | betas: 66 | - 0.9 67 | - 0.98 68 | sched: 69 | name: CosineAnnealing 70 | warmup_steps: 500 71 | constant_steps: 0 72 | min_lr: 2.0e-05 73 | data: 74 | train_dir: null 75 | val_dir: null 76 | dataset_type: hf 77 | use_synthetic_data: false 78 | tokenizer_name: null 79 | zipped_data: false 80 | viztracer: 81 | enabled: false 82 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama3.2-11b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/recipe.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama3.2-11b 3 | results_dir: /var/folders/6w/nm79zb595ll18wyj6czl6gfm0000gq/T/tmp1nal2g5n/llama3.2-11b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 4 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 100 14 | accumulate_grad_batches: 1 15 | gradient_clip_val: 1.0 16 | exp_manager: 17 | exp_dir: null 18 | name: experiment 19 | create_tensorboard_logger: true 20 | create_checkpoint_callback: true 21 | checkpoint_callback_params: 22 | save_top_k: 0 23 | every_n_train_steps: 10 24 | monitor: step 25 | mode: max 26 | checkpoint_dir: None/checkpoints/ 27 | resume_from_checkpoint: null 28 | auto_checkpoint: 29 | enabled: false 30 | export_full_model: 31 | every_n_train_steps: 0 32 | save_last: false 33 | use_smp_model: false 34 | distributed_backend: nccl 35 | model: 36 | model_type: llama_v3 37 | do_finetune: false 38 | hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct 39 | hf_access_token: null 40 | train_batch_size: 1 41 | seed: 12345 42 | grad_clip: 1.0 43 | use_flash_attention: true 44 | activation_checkpointing: true 45 | multi_modal: true 46 | delayed_param: false 47 | sharding_strategy: hybrid_shard 48 | forward_prefetch: true 49 | shard_degree: 32 50 | backward_fetch_policy: backward_pre 51 | auto_wrap_policy: transformer_auto_wrap_policy 52 | limit_all_gathers: false 53 | use_orig_param: false 54 | max_context_width: 8192 55 | precision: bf16 56 | lr_decay_iters: 47683 57 | log_reduced_training_loss: true 58 | peft: 59 | peft_type: null 60 | optim: 61 | name: adamw 62 | lr: 0.0002 63 | weight_decay: 0.01 64 | betas: 65 | - 0.9 66 | - 0.98 67 | sched: 68 | name: CosineAnnealing 69 | warmup_steps: 500 70 | constant_steps: 0 71 | min_lr: 2.0e-05 72 | data: 73 | train_dir: null 74 | val_dir: null 75 | dataset_type: hf 76 | use_synthetic_data: false 77 | tokenizer_name: null 78 | zipped_data: false 79 | viztracer: 80 | enabled: false 81 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/requirements.txt: 
-------------------------------------------------------------------------------- 1 | transformers==4.45.2 -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: false 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 1 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 2 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 64 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | fp8_amax_history_len: 1024 65 | fp8_amax_compute_algo: max 66 | max_context_width: 16384 67 | max_position_embeddings: 16384 68 | num_hidden_layers: 32 69 | hidden_size: 4096 70 | num_attention_heads: 32 71 | intermediate_size: 14336 72 | initializer_range: 0.02 73 | layernorm_epsilon: 1.0e-05 74 | vocab_size: 128256 75 | num_key_value_heads: 8 76 | use_flash_attention: true 77 | rope_theta: 500000.0 78 | rope_scaling: 79 | rope_type: llama3 80 | factor: 8.0 81 | high_freq_factor: 4.0 82 | low_freq_factor: 1.0 83 | original_max_position_embeddings: 8192 84 | do_finetune: false 85 | hf_model_name_or_path: null 86 | peft: 87 | peft_type: null 88 | precision: 
bf16 89 | lr_decay_iters: 50 90 | optim: 91 | name: adamw 92 | lr: 0.0001 93 | weight_decay: 0.01 94 | betas: 95 | - 0.9 96 | - 0.95 97 | sched: 98 | name: CosineAnnealing 99 | warmup_steps: 0 100 | constant_steps: 0 101 | min_lr: 1.0e-06 102 | data: 103 | train_dir: null 104 | val_dir: null 105 | dataset_type: hf 106 | use_synthetic_data: false 107 | viztracer: 108 | enabled: false 109 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_submission.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd $(dirname -- $0) 4 | python launch.py --job_name llama-8b --instance_type p5.48xlarge 5 | popd -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/sm_jobs_config.yaml: -------------------------------------------------------------------------------- 1 | output_path: s3://test_path 2 | tensorboard_config: 3 | output_path: s3://test_tensorboard_path 4 | container_logs_path: /opt/ml/output/tensorboard 5 | wait: true 6 | inputs: 7 | s3: 8 | train: null 9 | val: null 10 | file_system: 11 | id: null 12 | type: null 13 | directory_path: null 14 | additional_estimator_kwargs: 15 | max_run: 1800 16 | enable_remote_debug: true 17 | recipe_overrides: null 18 | -------------------------------------------------------------------------------- /tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_hydra.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: llama-8b 3 | results_dir: {$results_dir}/llama-8b 4 | time_limit: 6-00:00:00 5 | model_type: hf 6 | trainer: 7 | devices: 8 8 | num_nodes: 16 9 | accelerator: gpu 10 | precision: bf16 11 | max_steps: 50 12 | log_every_n_steps: 1 13 | val_check_interval: 1 14 | limit_val_batches: 0 15 | exp_manager: 16 | exp_dir: null 17 | name: experiment 18 | # experiment loggers 19 | create_tensorboard_logger: False 20 | summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"} 21 | create_mlflow_logger: False 22 | mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"} 23 | create_wandb_logger: False 24 | wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default 25 | create_checkpoint_callback: true 26 | checkpoint_callback_params: 27 | save_top_k: 0 28 | every_n_train_steps: 10 29 | monitor: step 30 | mode: max 31 | save_last: false 32 | checkpoint_dir: None/checkpoints/ 33 | resume_from_checkpoint: null 34 | auto_checkpoint: 35 | enabled: false 36 | export_full_model: 37 | every_n_train_steps: 0 38 | save_last: true 39 | use_smp_model: true 40 | distributed_backend: nccl 41 | model: 42 | model_type: llama_v3 43 | train_batch_size: 1 44 | val_batch_size: 1 45 | seed: 12345 46 | grad_clip: 1.0 47 | log_reduced_training_loss: true 48 | tensor_model_parallel_degree: 2 49 | expert_model_parallel_degree: 1 50 | context_parallel_degree: 1 51 | moe: false 52 | activation_checkpointing: false 53 | activation_loading_horizon: 1 54 | delayed_param: true 55 | offload_activations: false 56 | sharding_strategy: hybrid_shard 57 | forward_prefetch: true 58 | shard_degree: 64 59 | backward_fetch_policy: backward_pre 60 | auto_wrap_policy: transformer_auto_wrap_policy 61 | limit_all_gathers: true 62 | use_orig_param: true 63 | fp8: true 64 | 
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  max_context_width: 16384
  max_position_embeddings: 16384
  num_hidden_layers: 32
  hidden_size: 4096
  num_attention_heads: 32
  intermediate_size: 14336
  initializer_range: 0.02
  layernorm_epsilon: 1.0e-05
  vocab_size: 128256
  num_key_value_heads: 8
  use_flash_attention: true
  rope_theta: 500000.0
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    high_freq_factor: 4.0
    low_freq_factor: 1.0
    original_max_position_embeddings: 8192
  do_finetune: false
  hf_model_name_or_path: null
  peft:
    peft_type: null
  precision: bf16
  lr_decay_iters: 50
  optim:
    name: adamw
    lr: 0.0001
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 0
      min_lr: 1.0e-06
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: false
  viztracer:
    enabled: false
--------------------------------------------------------------------------------
/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_submission.sh:
--------------------------------------------------------------------------------
#!/bin/bash

pushd $(dirname -- $0)
python launch.py --job_name llama-8b --instance_type p5.48xlarge
popd
--------------------------------------------------------------------------------
/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/sm_jobs_config.yaml:
--------------------------------------------------------------------------------
output_path: s3://test_path
tensorboard_config:
  output_path: null
  container_logs_path: null
wait: true
inputs:
  s3:
    train: s3://test_path
    val: s3://test_path
  file_system:
    id: null
    type: null
    directory_path: null
additional_estimator_kwargs:
  max_run: 1800
  enable_remote_debug: true
recipe_overrides: null
--------------------------------------------------------------------------------
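The three sm_jobs_config.yaml baselines above differ only in their tensorboard_config and inputs.s3 sections, and appear to serve as expected outputs for the sm_jobs workflow tests. As a rough illustration of how a generated config could be diffed against such a baseline with OmegaConf (the helper name and Path arguments below are hypothetical, not the repository's actual test code):

# Illustrative sketch: diff a generated sm_jobs_config.yaml against a baseline artifact.
# assert_matches_baseline and its arguments are assumptions made for this example.
from pathlib import Path

from omegaconf import OmegaConf


def assert_matches_baseline(generated: Path, baseline: Path) -> None:
    generated_dict = OmegaConf.to_container(OmegaConf.load(generated), resolve=True)
    baseline_dict = OmegaConf.to_container(OmegaConf.load(baseline), resolve=True)
    # Comparing plain dicts surfaces nested differences (e.g. inputs.s3.train) in pytest's diff output.
    assert generated_dict == baseline_dict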
/tests/test_config_files.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from typing import Optional

from omegaconf import OmegaConf

from launcher.nemo.constants import ROOT_DIR

from .test_utils import (
    is_job_run_name_valid_for_clusters,
    make_hydra_cfg_instance,
    validate_distributed_degrees,
)

logger = logging.getLogger(__name__)


def test_configuration_files():
    recipes_dir = ROOT_DIR / "recipes_collection/recipes"
    log_config_name = lambda name: logger.info(f"\nFailing Config File: {name}")

    for path in recipes_dir.rglob("*.yaml"):
        if not path.is_file():
            continue

        # Hydra requires relative path definition
        file_path: str = "../" + str(path.relative_to(ROOT_DIR).parent)
        config = make_hydra_cfg_instance(file_path, path.name)

        # plucking values outside the method arguments substantially reduces log output on failure
        shard_degree = OmegaConf.select(config, "model.shard_degree")
        tensor_model_parallel_degree = OmegaConf.select(config, "model.tensor_model_parallel_degree")
        expert_model_parallel_degree = OmegaConf.select(config, "model.expert_model_parallel_degree")
        context_parallel_degree = OmegaConf.select(config, "model.context_parallel_degree")
        num_nodes = OmegaConf.select(config, "trainer.num_nodes")

        assert validate_distributed_degrees(
            shard_degree, tensor_model_parallel_degree, expert_model_parallel_degree, context_parallel_degree, num_nodes
        ), log_config_name(path.name)

        job_run_name: Optional[str] = config.get("run", {}).get("name")
        assert is_job_run_name_valid_for_clusters(job_run_name), log_config_name(path.name)
--------------------------------------------------------------------------------
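test_configuration_files defers the parallelism check to validate_distributed_degrees in tests/test_utils.py, which is not included in this excerpt. A minimal sketch of the kind of rule such a check typically enforces follows; the function name, the 8-GPUs-per-node assumption, and the exact divisibility condition are illustrative rather than the repository's implementation.

# Illustrative sketch only; the real helper lives in tests/test_utils.py (not shown here).
from typing import Optional


def validate_distributed_degrees_sketch(
    shard_degree: Optional[int],
    tensor_model_parallel_degree: Optional[int],
    expert_model_parallel_degree: Optional[int],
    context_parallel_degree: Optional[int],
    num_nodes: Optional[int],
    gpus_per_node: int = 8,  # assumption: 8 accelerators per node (e.g. ml.p5.48xlarge)
) -> bool:
    world_size = (num_nodes or 1) * gpus_per_node
    product = 1
    for degree in (
        shard_degree,
        tensor_model_parallel_degree,
        expert_model_parallel_degree,
        context_parallel_degree,
    ):
        product *= degree or 1  # missing degrees default to 1
    # The combined parallel degrees must tile the world size exactly.
    return product <= world_size and world_size % product == 0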
/tests/test_launcher_scripts.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from typing import Optional

from launcher.nemo.constants import ROOT_DIR

from .test_utils import (
    get_launcher_run_script_paths,
    is_job_run_name_valid_for_clusters,
)

logger = logging.getLogger(__name__)

RUN_SCRIPT_PATHS = get_launcher_run_script_paths()


def test_config_for_run_script_exists():
    RECIPES_DIR = ROOT_DIR / "recipes_collection/recipes"
    log_line = lambda script, config: logger.info(
        f"\nlauncher file: {script.relative_to(ROOT_DIR)}" f"\nconfig file: {config.relative_to(ROOT_DIR)}" "\n"
    )

    def extract_value_in_line(line: str) -> str:
        _, value = line.split("=")
        value = value.replace(" \\", "")  # remove shell line continuation marker
        value = value.strip()
        return value

    def assert_recipe_config_exists(line: str, config_path_str: str):
        # Example:
        # recipes=training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain
        config_path = RECIPES_DIR / (config_path_str + ".yaml")  # append .yaml
        assert config_path.exists(), log_line(run_script_path, config_path)

    def assert_run_name_is_valid(line: str, config_path_str: Optional[str]):
        """
        Ensure the name is valid for Slurm and Kubernetes clusters
        """
        # Example:
        # recipes.run.name="hf-llama3-70b-lora" \
        run_name = extract_value_in_line(line)
        run_name = run_name.replace('"', "")  # remove quotes
        run_name = run_name.strip()

        if config_path_str is None:
            config_path_str = "config_file_not_defined"

        config_path = RECIPES_DIR / (config_path_str + ".yaml")  # append .yaml
        assert is_job_run_name_valid_for_clusters(run_name), log_line(run_script_path, config_path)

    for run_script_path in RUN_SCRIPT_PATHS:
        with open(run_script_path, "r") as fd:
            for line in fd:
                config_path_str = None

                if "recipes=" in line:
                    config_path_str = extract_value_in_line(line)
                    assert_recipe_config_exists(line, config_path_str)

                if "recipes.run.name=" in line:
                    assert_run_name_is_valid(line, config_path_str)
--------------------------------------------------------------------------------
/tests/test_readme.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import logging
from difflib import SequenceMatcher
from typing import List

from launcher.nemo.constants import ROOT_DIR

logger = logging.getLogger(__name__)


def test_readme_table_links():
    readme_path = ROOT_DIR / "README.md"
    log_line = lambda line: logger.info(f"\nFailing line:\n{line}")

    def pluck_path_strings(line: str):
        paths_str: List[str] = []

        for chunk in line.split("|"):  # split by column delimiter
            if "[link]" in chunk:
                chunk = chunk.strip()
                chunk = chunk.replace("[link]", "")
                assert chunk[0] == "(" and chunk[-1] == ")", log_line(line)
                chunk = chunk[1:-1]  # remove parentheses
                paths_str.append(chunk)

        return paths_str

    with open(readme_path, "r") as fd:
        for line in fd:
            """
            Example:
            | Hugging Face | Llama 3.2 | 11b | 8192 | 4 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh) |
            """
            if "[link]" in line:
                paths_str = pluck_path_strings(line)

                if len(paths_str) == 1:
                    file_path = ROOT_DIR / paths_str[0]
                    assert file_path.exists(), log_line(line)
                # there is a config and a script link
                elif len(paths_str) == 2:
                    config_file_path = ROOT_DIR / paths_str[0]
                    launcher_script_path = ROOT_DIR / paths_str[1]
                    # try to catch if a launch script is pointing to an incorrect config
                    str_distance_ratio = SequenceMatcher(None, config_file_path.stem, launcher_script_path.stem).ratio()

                    assert config_file_path.exists(), log_line(line)
                    assert launcher_script_path.exists(), log_line(line)
                    assert str_distance_ratio >= 0.8, log_line(line)
                else:
                    raise Exception("test condition not covered")
--------------------------------------------------------------------------------
/validations_wrapper.py:
--------------------------------------------------------------------------------
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from functools import wraps
from typing import Any, Callable, TypeVar, cast

from omegaconf import DictConfig

from launcher.config_validator.type_validator import TypeValidator
from launcher.config_validator.value_validator import ValueValidator

_T = TypeVar("_T", bound=Callable[..., Any])


def validate_config(fn: _T) -> _T:
    @wraps(fn)
    def validations_wrapper(config: DictConfig, *args, **kwargs) -> DictConfig:
        """
        Execute all validations in this function
        """
        type_validator = TypeValidator(config)
        type_validator.validate()
        schema_validator = ValueValidator(config)
        schema_validator.validate()

        return fn(config, *args, **kwargs)

    return cast(_T, validations_wrapper)
--------------------------------------------------------------------------------
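validate_config is intended to wrap the launcher's Hydra entry point so that TypeValidator and ValueValidator run before any launcher logic touches the config. A minimal usage sketch follows, assuming an entry point shaped like main.py; the config_path/config_name values and the body of main are illustrative, not the repository's actual code.

# Illustrative usage of the validate_config decorator on a Hydra entry point.
# config_path/config_name mirror recipes_collection/config.yaml but are assumptions here.
import hydra
from omegaconf import DictConfig, OmegaConf

from validations_wrapper import validate_config


@hydra.main(config_path="recipes_collection", config_name="config", version_base=None)
@validate_config
def main(config: DictConfig) -> None:
    # Both validators have already run; an invalid config raises before this point.
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()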