├── .github └── workflows │ ├── autoconf.yml │ ├── launcher.yml │ └── style.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── auto_configurator ├── autoconfig │ ├── __init__.py │ ├── base_config.py │ ├── inference_summary.py │ ├── inference_sweep.py │ ├── scripts │ │ └── compare_throughput.py │ ├── search_config.py │ ├── train.py │ ├── training_config.py │ └── utils.py ├── base_configs │ ├── baichuan2_13b.yaml │ ├── baichuan2_7b.yaml │ ├── bert.yaml │ ├── chatglm_6b.yaml │ ├── gpt3.yaml │ ├── llama2_13b.yaml │ ├── llama2_70b.yaml │ ├── llama2_7b.yaml │ ├── llama3_70b.yaml │ ├── llama3_8b.yaml │ ├── mixtral_3b.yaml │ ├── mixtral_7b.yaml │ ├── mt5.yaml │ ├── qwen2_14b.yaml │ ├── qwen2_4b.yaml │ ├── qwen2_72b.yaml │ ├── qwen2_7b.yaml │ └── t5.yaml ├── conf │ ├── cluster │ │ └── bcm.yaml │ ├── config.yaml │ └── search_config │ │ ├── baichuan2 │ │ ├── 13b.yaml │ │ └── 7b.yaml │ │ ├── bert │ │ ├── 0.11b.yaml │ │ ├── 100b.yaml │ │ ├── 20b.yaml │ │ ├── 4b.yaml │ │ └── unknown_size.yaml │ │ ├── chatglm │ │ └── 6b.yaml │ │ ├── gpt3 │ │ ├── 0.126b.yaml │ │ ├── 0.843b.yaml │ │ ├── 175b.yaml │ │ ├── 20b.yaml │ │ ├── 2b.yaml │ │ ├── 43b.yaml │ │ ├── 5b.yaml │ │ ├── 8b.yaml │ │ └── unknown_size.yaml │ │ ├── llama │ │ ├── llama2_13b.yaml │ │ ├── llama2_70b.yaml │ │ ├── llama2_7b.yaml │ │ ├── llama2_7b_nemo.yaml │ │ ├── llama3_70b.yaml │ │ └── llama3_8b.yaml │ │ ├── mixtral │ │ ├── 3b.yaml │ │ └── 7b.yaml │ │ ├── mt5 │ │ ├── 0.17b.yaml │ │ ├── 0.39b.yaml │ │ ├── 101.6b.yaml │ │ ├── 11.9b.yaml │ │ ├── 206b.yaml │ │ ├── 24.65b.yaml │ │ ├── 3.2b.yaml │ │ ├── 42.54b.yaml │ │ └── unknown_size.yaml │ │ ├── qwen2 │ │ ├── 14b.yaml │ │ ├── 4b.yaml │ │ ├── 72b.yaml │ │ └── 7b.yaml │ │ └── t5 │ │ ├── 0.22b.yaml │ │ ├── 100b.yaml │ │ ├── 11b.yaml │ │ ├── 2.8b.yaml │ │ ├── 203b.yaml │ │ ├── 23.5b.yaml │ │ ├── 41.2b.yaml │ │ └── unknown_size.yaml ├── main.py ├── tests │ ├── __init__.py │ ├── base_configs_tests │ │ ├── __init__.py │ │ └── test_base_configs.py │ ├── code_tests │ │ ├── __init__.py │ │ ├── test_base_config.py │ │ ├── test_training_config.py │ │ └── test_utils.py │ └── config_tests │ │ ├── __init__.py │ │ ├── test_bert_config.py │ │ ├── test_cluster_config.py │ │ ├── test_gpt3_config.py │ │ ├── test_llama_config.py │ │ ├── test_main_config.py │ │ ├── test_mixtral_config.py │ │ ├── test_mt5_config.py │ │ └── test_t5_config.py └── tuning │ ├── README.md │ ├── conf │ ├── cluster │ │ └── bcm.yaml │ └── config.yaml │ ├── main.py │ └── src │ ├── result_analysis.py │ ├── search.py │ └── utils.py ├── csp_tools ├── aws │ ├── Dockerfile │ ├── build-nccl-tests.sh │ ├── cluster-validation-sample-output │ │ └── dcgmi-220.out │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml ├── azure │ ├── build-nccl-tests.sh │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml └── oci │ ├── build-nccl-tests.sh │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml ├── examples ├── README.md ├── peft │ ├── llama │ │ ├── a100 │ │ │ ├── 13b_1node.sh │ │ │ ├── 13b_1node_lora.sh │ │ │ ├── 70b_1node_lora.sh │ │ │ ├── 70b_2node.sh │ │ │ ├── 7b_1node.sh │ │ │ ├── 7b_1node_lora.sh │ │ │ └── lora_4gpu_k8s.sh │ │ └── h100 │ │ │ ├── 13b_1node.sh │ │ │ ├── 13b_1node_lora.sh │ │ │ ├── 70b_1node_lora.sh │ │ │ ├── 70b_2node.sh │ │ │ ├── 7b_1node.sh │ │ │ └── 7b_1node_lora.sh │ └── nemotron │ │ └── h100 │ │ ├── 22b_1node.sh │ │ ├── 22b_1node_lora.sh │ │ ├── 8b_1node.sh │ │ └── 8b_1node_lora.sh └── training │ ├── gpt │ ├── a100 │ │ ├── 
175b_16node.sh │ │ ├── 20b_1node.sh │ │ ├── 20b_8node.sh │ │ ├── 40b_8node.sh │ │ ├── 5b_1node.sh │ │ ├── 5b_8node.sh │ │ ├── fsdp_20b_1node.sh │ │ ├── fsdp_40b_32node.sh │ │ └── fsdp_5b_1node.sh │ └── h100 │ │ ├── 175b_bf16_16node.sh │ │ ├── 175b_fp8_16node.sh │ │ ├── 20b_bf16_1node.sh │ │ ├── 20b_bf16_8node.sh │ │ ├── 20b_fp8_1node.sh │ │ ├── 20b_fp8_8node.sh │ │ ├── 40b_bf16_8node.sh │ │ ├── 40b_fp8_8node.sh │ │ ├── 5b_bf16_1node.sh │ │ ├── 5b_bf16_8node.sh │ │ ├── 5b_fp8_1node.sh │ │ ├── 5b_fp8_8node.sh │ │ ├── fsdp_20b_bf16_1node.sh │ │ ├── fsdp_40b_bf16_32node.sh │ │ └── fsdp_5b_bf16_1node.sh │ ├── grok1-proxy │ └── h100 │ │ ├── grok1_proxy_bf16.sh │ │ └── grok1_proxy_fp8.sh │ ├── llama │ ├── a100 │ │ ├── llama2_13b_bf16.sh │ │ ├── llama2_70b_bf16.sh │ │ └── llama2_7b_bf16.sh │ └── h100 │ │ ├── llama2_13b_bf16.sh │ │ ├── llama2_13b_fp8.sh │ │ ├── llama2_70b_bf16.sh │ │ ├── llama2_70b_fp8.sh │ │ ├── llama2_7b_bf16.sh │ │ ├── llama2_7b_fp8.sh │ │ ├── llama3_405b_bf16.sh │ │ ├── llama3_405b_fp8.sh │ │ ├── llama3_70b_bf16.sh │ │ ├── llama3_70b_fp8.sh │ │ ├── llama3_8b_bf16.sh │ │ └── llama3_8b_fp8.sh │ ├── mixtral │ └── h100 │ │ ├── mixtral_8x3b_bf16.sh │ │ ├── mixtral_8x3b_fp8.sh │ │ ├── mixtral_8x7b_bf16.sh │ │ └── mixtral_8x7b_fp8.sh │ └── nemotron │ ├── a100 │ ├── nemotron_22b_bf16.sh │ └── nemotron_8b_bf16.sh │ └── h100 │ ├── nemotron_22b_bf16.sh │ ├── nemotron_22b_fp8.sh │ ├── nemotron_8b_bf16.sh │ └── nemotron_8b_fp8.sh ├── img ├── 170M_mT5_loss_final.svg ├── 175B_GPT_3_throughput.svg ├── 220M_T5_loss_final.svg ├── 390M_mT5_loss_final.svg ├── 3B_T5_loss_100percent.svg ├── 3B_T5_loss_75percent.svg ├── 3B_T5_throughput_2205.svg ├── 3B_T5_throughput_2208.svg ├── 3B_mT5_loss_75percent.svg ├── 3B_mT5_loss_final.svg ├── 3B_mT5_throughput_2205.svg ├── 3B_mT5_throughput_2208.svg ├── 4B_bert_throughput_2211.png ├── 4b_bert_loss_final.png ├── 5B_GPT_3_loss_final.svg ├── 5B_GPT_3_throughput.svg ├── infer_model_size_gpt3.svg ├── infer_model_size_gpt3.svg_old ├── infer_model_size_mt5.svg ├── infer_model_size_t5.svg └── model_overview.png ├── launcher_scripts ├── __init__.py ├── conf │ ├── adapter_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── cluster │ │ ├── bcm.yaml │ │ ├── k8s.yaml │ │ └── k8s_v2.yaml │ ├── config.yaml │ ├── conversion │ │ ├── baichuan2 │ │ │ └── convert_baichuan2.yaml │ │ ├── chatglm │ │ │ └── convert_chatglm.yaml │ │ ├── clip │ │ │ └── convert_clip.yaml │ │ ├── controlnet │ │ │ └── convert_controlnet.yaml │ │ ├── dreambooth │ │ │ └── convert_dreambooth.yaml │ │ ├── gpt3 │ │ │ └── convert_gpt3.yaml │ │ ├── imagen │ │ │ └── convert_imagen.yaml │ │ ├── instruct_pix2pix │ │ │ └── convert_instruct_pix2pix.yaml │ │ ├── llama │ │ │ └── convert_llama.yaml │ │ ├── mistral │ │ │ └── convert_mistral.yaml │ │ ├── mixtral │ │ │ ├── convert_mixtral.yaml │ │ │ └── convert_mixtral_8x22b.yaml │ │ ├── mt5 │ │ │ └── convert_mt5.yaml │ │ ├── nemotron │ │ │ └── convert_nemotron.yaml │ │ ├── neva │ │ │ └── convert_neva.yaml │ │ ├── qwen2 │ │ │ └── convert_qwen2.yaml │ │ ├── stable_diffusion │ │ │ └── convert_stable_diffusion.yaml │ │ ├── starcoder2 │ │ │ └── convert_starcoder2.yaml │ │ ├── t5 │ │ │ └── convert_t5.yaml │ │ └── vit │ │ │ └── convert_vit.yaml │ ├── conversion_hf2nemo │ │ ├── hf_llama2 │ │ │ └── convert_llama2_nemo.yaml │ │ ├── hf_mistral_7b │ │ │ └── convert_mistral_7b_nemo.yaml │ │ └── hf_mixtral │ │ │ ├── convert_mixtral_8x22b_nemo.yaml │ │ │ └── convert_mixtral_nemo.yaml │ ├── data_curation │ │ ├── 
common_crawl │ │ │ ├── compute_minhashes │ │ │ │ └── compute_minhashes.yaml │ │ │ ├── connected_component │ │ │ │ └── connected_component.yaml │ │ │ ├── curate_common_crawl.yaml │ │ │ ├── fasttext_download │ │ │ │ └── fasttext_download.yaml │ │ │ ├── find_matching_ngrams │ │ │ │ └── find_matching_ngrams.yaml │ │ │ ├── jaccard_compute │ │ │ │ └── jaccard_compute.yaml │ │ │ ├── jaccard_map_buckets │ │ │ │ └── jaccard_map_buckets.yaml │ │ │ ├── jaccard_shuffle │ │ │ │ └── jaccard_shuffle.yaml │ │ │ ├── language_identification │ │ │ │ └── language_identification.yaml │ │ │ ├── minhash_buckets │ │ │ │ └── minhash_buckets.yaml │ │ │ ├── prepare_task_data │ │ │ │ └── prepare_task_data.yaml │ │ │ ├── quality_filtering │ │ │ │ └── heuristic_english.yaml │ │ │ ├── remove_matching_ngrams │ │ │ │ └── remove_matching_ngrams.yaml │ │ │ ├── separate_by_language │ │ │ │ └── separate_by_language.yaml │ │ │ ├── text_cleaning │ │ │ │ └── text_cleaning.yaml │ │ │ ├── verify_all_pairs_jaccard │ │ │ │ └── verify_all_pairs_jaccard.yaml │ │ │ └── write_deduped_result_with_text │ │ │ │ └── write_deduped_result_with_text.yaml │ │ └── sft │ │ │ ├── curate_sft.yaml │ │ │ ├── find_matching_ngrams │ │ │ └── find_matching_ngrams.yaml │ │ │ ├── prepare_task_data │ │ │ └── prepare_task_data.yaml │ │ │ └── remove_matching_ngrams │ │ │ └── remove_matching_ngrams.yaml │ ├── data_preparation │ │ ├── baichuan2 │ │ │ └── download_baichuan2_pile.yaml │ │ ├── bert │ │ │ └── download_bert_pile.yaml │ │ ├── chatglm │ │ │ └── download_chatglm_pile.yaml │ │ ├── code_llama │ │ │ └── download_human_eval.yaml │ │ ├── falcon │ │ │ └── download_falcon_pile.yaml │ │ ├── fid_evaluation │ │ │ └── download_coco2014.yaml │ │ ├── generic │ │ │ └── custom_dataset.yaml │ │ ├── gpt │ │ │ └── download_slim_pajama.yaml │ │ ├── gpt3 │ │ │ └── download_gpt3_pile.yaml │ │ ├── llama │ │ │ └── download_llama_pile.yaml │ │ ├── mistral │ │ │ ├── download_mistral_nemo_123b_pile.yaml │ │ │ ├── download_mistral_nemo_12b_pile.yaml │ │ │ └── download_mistral_pile.yaml │ │ ├── mixtral │ │ │ ├── download_mixtral_8x22b_pile.yaml │ │ │ └── download_mixtral_pile.yaml │ │ ├── mt5 │ │ │ └── download_mc4.yaml │ │ ├── multimodal │ │ │ ├── download_multimodal.yaml │ │ │ ├── precache_sd.yaml │ │ │ └── precache_t5xxl.yaml │ │ ├── nemotron │ │ │ └── download_nemotron_pile.yaml │ │ ├── steerlm │ │ │ ├── steerlm_data_prep1.yaml │ │ │ └── steerlm_data_prep2_reg.yaml │ │ └── t5 │ │ │ └── download_t5_pile.yaml │ ├── evaluation │ │ ├── adapter_gpt3 │ │ │ └── squad.yaml │ │ ├── adapter_t5 │ │ │ └── squad.yaml │ │ ├── baichuan2 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── chatglm │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── clip │ │ │ └── imagenet_zeroshot.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── evaluate_all.yaml │ │ ├── gpt3 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_lambada.yaml │ │ ├── ia3_gpt3 │ │ │ └── squad.yaml │ │ ├── ia3_t5 │ │ │ └── squad.yaml │ │ ├── imagen │ │ │ └── fid_clip.yaml │ │ ├── llama │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── mistral │ │ │ └── evaluate_all.yaml │ │ ├── mixtral │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_all_8x22b.yaml │ │ ├── mt5 │ │ │ ├── custom_task.yaml │ │ │ └── xquad.yaml │ │ ├── nemotron │ │ │ └── evaluate_all.yaml │ │ ├── peft_baichuan2 │ │ │ └── squad.yaml │ │ ├── peft_chatglm │ │ │ └── squad.yaml │ │ ├── peft_falcon │ │ │ └── squad.yaml │ │ ├── peft_llama │ │ │ └── squad.yaml │ │ ├── peft_mistral │ │ │ └── squad.yaml │ │ ├── 
peft_mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── peft_t5 │ │ │ └── squad.yaml │ │ ├── prompt_gpt3 │ │ │ └── squad.yaml │ │ ├── prompt_llama │ │ │ └── squad.yaml │ │ ├── prompt_mt5 │ │ │ └── squad.yaml │ │ ├── prompt_t5 │ │ │ └── squad.yaml │ │ ├── qwen2 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── retro │ │ │ ├── evaluate_nq.yaml │ │ │ └── evaluate_tqa.yaml │ │ ├── stable_diffusion │ │ │ └── fid_clip.yaml │ │ ├── starcoder2 │ │ │ └── human_eval.yaml │ │ ├── t5 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ └── vit │ │ │ └── imagenet_val.yaml │ ├── export │ │ ├── gpt3 │ │ │ └── export_gpt3.yaml │ │ ├── mt5 │ │ │ └── export_mt5.yaml │ │ └── t5 │ │ │ └── export_t5.yaml │ ├── external_conversion │ │ └── clip │ │ │ └── convert_external_clip.yaml │ ├── fine_tuning │ │ ├── baichuan2 │ │ │ └── squad.yaml │ │ ├── bert_embedding │ │ │ └── sft.yaml │ │ ├── chatglm │ │ │ └── squad.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── squad.yaml │ │ ├── gpt3 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ ├── mamba │ │ │ └── sft.yaml │ │ ├── mistral │ │ │ └── squad.yaml │ │ ├── mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── mt5 │ │ │ ├── custom_task.yaml │ │ │ └── xquad.yaml │ │ ├── neva │ │ │ ├── llama2_13b_chat.yaml │ │ │ ├── llama2_7b_chat.yaml │ │ │ ├── llama3_70b_chat.yaml │ │ │ ├── llama3_8b_chat.yaml │ │ │ ├── mistral_7b_instruct.yaml │ │ │ └── mixtral_8x7b_instruct.yaml │ │ ├── nsfw │ │ │ └── nsfw_L_14.yaml │ │ ├── qwen2 │ │ │ └── squad.yaml │ │ ├── t5 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ ├── video_neva │ │ │ └── llama3_8b_vita.yaml │ │ └── vit │ │ │ └── imagenet1k.yaml │ ├── fw_inference │ │ ├── clip │ │ │ └── clip_similarity.yaml │ │ ├── controlnet │ │ │ └── controlnet_infer.yaml │ │ ├── dreambooth │ │ │ └── text2img.yaml │ │ ├── imagen │ │ │ └── text2img.yaml │ │ ├── instruct_pix2pix │ │ │ └── edit_cli.yaml │ │ ├── neva │ │ │ └── inference.yaml │ │ ├── nsfw │ │ │ └── nsfw.yaml │ │ ├── retro │ │ │ └── retro_inference.yaml │ │ ├── sdxl │ │ │ └── sdxl_infer.yaml │ │ ├── stable_diffusion │ │ │ └── text2img.yaml │ │ ├── video_neva │ │ │ └── inference.yaml │ │ └── vit │ │ │ └── imagenet1k.yaml │ ├── ia3_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── peft │ │ ├── baichuan2 │ │ │ └── squad.yaml │ │ ├── chatglm │ │ │ └── squad.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── squad.yaml │ │ ├── gemma │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── griffin │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── llama │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── mistral │ │ │ └── squad.yaml │ │ ├── mistral_embedding │ │ │ └── squad.yaml │ │ ├── mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── nemotron │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── neva │ │ │ ├── llama2_13b_chat.yaml │ │ │ ├── llama2_70b_chat.yaml │ │ │ ├── llama2_7b_chat.yaml │ │ │ ├── llama3_70b_chat.yaml │ │ │ ├── llama3_8b_chat.yaml │ │ │ ├── mistral_7b_instruct.yaml │ │ │ ├── mixtral_8x7b_instruct.yaml │ │ │ └── nemotron4_340b_chat.yaml │ │ ├── qwen2 │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── starcoder2 │ │ │ └── sft.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── prompt_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ ├── mt5 │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── ptq │ │ └── model │ │ │ └── 
quantization.yaml │ ├── rag_generating │ │ └── gpt3 │ │ │ ├── 125m.yaml │ │ │ └── 7b.yaml │ ├── rag_indexing │ │ └── bert │ │ │ ├── 110m.yaml │ │ │ └── 340m.yaml │ ├── rlhf_ppo │ │ └── gpt3 │ │ │ └── 2b_ppo.yaml │ ├── rlhf_rm │ │ └── gpt3 │ │ │ └── 2b_rm.yaml │ ├── steerlm_reg │ │ ├── ac_sft │ │ │ └── gpt_sft.yaml │ │ └── rw_sft │ │ │ └── training_rm.yaml │ └── training │ │ ├── baichuan2 │ │ ├── baichuan2_13b.yaml │ │ └── baichuan2_7b.yaml │ │ ├── bert │ │ ├── 100b.yaml │ │ ├── 110m.yaml │ │ ├── 20b.yaml │ │ └── 4b.yaml │ │ ├── chatglm │ │ ├── chatglm2-6b.yaml │ │ └── chatglm3-6b.yaml │ │ ├── clip │ │ ├── siglip_config.yaml │ │ ├── vit_B_32.yaml │ │ ├── vit_H_14.yaml │ │ └── vit_g_14.yaml │ │ ├── controlnet │ │ └── controlnet_v1-5.yaml │ │ ├── dreambooth │ │ └── 860m.yaml │ │ ├── falcon │ │ └── falcon_7b.yaml │ │ ├── gpt3 │ │ ├── 126m.yaml │ │ ├── 175b.yaml │ │ ├── 175b_16k.yaml │ │ ├── 175b_32k.yaml │ │ ├── 175b_mlperf.yaml │ │ ├── 1b_improved.yaml │ │ ├── 20b.yaml │ │ ├── 400m_improved.yaml │ │ ├── 40b.yaml │ │ ├── 40b_16k.yaml │ │ ├── 40b_32k.yaml │ │ ├── 40b_64k.yaml │ │ ├── 40b_improved.yaml │ │ ├── 5b.yaml │ │ ├── 5b_16k.yaml │ │ ├── 5b_32k.yaml │ │ ├── 5b_64k.yaml │ │ ├── 7b_improved.yaml │ │ ├── mlperf-24n.yaml │ │ └── mlperf.yaml │ │ ├── grok │ │ └── grok1_proxy.yaml │ │ ├── imagen │ │ ├── 2b_res_64.yaml │ │ ├── 400m_res_256.yaml │ │ ├── 500m_res_64.yaml │ │ ├── 600m_res_1024.yaml │ │ └── 600m_res_256.yaml │ │ ├── instruct_pix2pix │ │ └── 860m_sd_edit.yaml │ │ ├── llama │ │ ├── llama1_13b.yaml │ │ ├── llama1_30b.yaml │ │ ├── llama1_65b.yaml │ │ ├── llama1_7b.yaml │ │ ├── llama2_13b.yaml │ │ ├── llama2_70b.yaml │ │ ├── llama2_7b.yaml │ │ ├── llama3_1_405b.yaml │ │ ├── llama3_1_70b.yaml │ │ ├── llama3_1_8b.yaml │ │ ├── llama3_70b.yaml │ │ └── llama3_8b.yaml │ │ ├── mistral │ │ ├── mistral_7b.yaml │ │ ├── mistral_nemo_123b.yaml │ │ └── mistral_nemo_12b.yaml │ │ ├── mixtral │ │ ├── mixtral_8x22b.yaml │ │ ├── mixtral_8x3b.yaml │ │ └── mixtral_8x7b.yaml │ │ ├── mt5 │ │ ├── 11b.yaml │ │ ├── 170m.yaml │ │ ├── 23b.yaml │ │ ├── 390m.yaml │ │ └── 3b.yaml │ │ ├── nemotron │ │ ├── nemotron_15b.yaml │ │ ├── nemotron_22b.yaml │ │ ├── nemotron_340b.yaml │ │ ├── nemotron_4b.yaml │ │ └── nemotron_8b.yaml │ │ ├── nerf │ │ ├── dreamfusion-dmtet.yaml │ │ ├── dreamfusion.yaml │ │ └── model │ │ │ ├── background │ │ │ ├── random.yaml │ │ │ ├── static.yaml │ │ │ ├── tcnn.yaml │ │ │ └── torchngp.yaml │ │ │ ├── data │ │ │ └── data.yaml │ │ │ ├── dreamfusion-dmtet.yaml │ │ │ ├── dreamfusion.yaml │ │ │ ├── guidance │ │ │ ├── sd_huggingface.yaml │ │ │ ├── sd_nemo.yaml │ │ │ └── sd_trt.yaml │ │ │ ├── loss │ │ │ ├── dmtet.yaml │ │ │ └── dreamfusion.yaml │ │ │ ├── material │ │ │ └── basic_shading.yaml │ │ │ ├── nerf │ │ │ ├── tcnn.yaml │ │ │ └── torchngp.yaml │ │ │ ├── optim │ │ │ └── adan.yaml │ │ │ └── renderer │ │ │ ├── nerfacc.yaml │ │ │ ├── nvdiffrast.yaml │ │ │ └── torchngp_raymarching.yaml │ │ ├── neva │ │ ├── llama2_13b_chat.yaml │ │ ├── llama2_70b_chat.yaml │ │ ├── llama2_7b_chat.yaml │ │ ├── llama3_70b_chat.yaml │ │ ├── llama3_8b_chat.yaml │ │ ├── mistral_7b_instruct.yaml │ │ ├── mixtral_8x7b_instruct.yaml │ │ └── nemotron4_340b_chat.yaml │ │ ├── qwen2 │ │ ├── qwen2_14b.yaml │ │ ├── qwen2_4b.yaml │ │ ├── qwen2_72b.yaml │ │ └── qwen2_7b.yaml │ │ ├── retro │ │ └── 300m.yaml │ │ ├── sdxl │ │ ├── sdxl_base_train_res_1024_stage_3.yaml │ │ ├── sdxl_base_train_res_256_stage_1.yaml │ │ └── sdxl_base_train_res_512_stage_2.yaml │ │ ├── stable_diffusion │ │ ├── 860m_res_256_pretrain.yaml │ │ ├── 
860m_res_256_v2_0_pretrain.yaml │ │ ├── 860m_res_512_v1_1.yaml │ │ ├── 860m_res_512_v1_2.yaml │ │ ├── 860m_res_512_v1_5.yaml │ │ └── 860m_res_512_v2_0_base.yaml │ │ ├── starcoder2 │ │ ├── starcoder2_15b.yaml │ │ ├── starcoder2_3b.yaml │ │ └── starcoder2_7b.yaml │ │ ├── t5 │ │ ├── 11b.yaml │ │ ├── 220m.yaml │ │ ├── 23b.yaml │ │ ├── 3b.yaml │ │ └── 41b.yaml │ │ ├── tp_overlap │ │ ├── ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml │ │ ├── ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml │ │ ├── ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h5120_tp2_mbs1_seqlen4096.yaml │ │ ├── ub_cfg_h100_h6144_tp2_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h8192_tp2_mbs1_seqlen4096.yaml │ │ ├── ub_cfg_h100_h8192_tp4_mbs1_seqlen4096.yaml │ │ └── ub_cfg_h100_h8192_tp4_mbs1_seqlen8192.yaml │ │ ├── video_neva │ │ ├── llama2_13b_chat.yaml │ │ ├── llama2_70b_chat.yaml │ │ ├── llama2_7b_chat.yaml │ │ ├── mistral_7b_instruct.yaml │ │ └── mixtral_8x7b_instruct.yaml │ │ └── vit │ │ ├── B_16.yaml │ │ ├── H_14.yaml │ │ ├── L_16.yaml │ │ ├── bigG_14.yaml │ │ └── g_14.yaml ├── data │ └── nsfw │ │ └── concepts.txt ├── main.py ├── nemo_launcher │ ├── __init__.py │ ├── collections │ │ ├── __init__.py │ │ ├── auto_blend.py │ │ ├── checkpoint_search.py │ │ ├── conditional_cfgs.py │ │ ├── conf │ │ │ ├── auto_blend.yaml │ │ │ ├── checkpoint_search.yaml │ │ │ ├── get_ag_overlap.yaml │ │ │ ├── get_ln_sm_margin.yaml │ │ │ ├── hparams_override.yaml │ │ │ └── numa_mapping.yaml │ │ ├── datacuration_scripts │ │ │ └── download_fasttext.sh │ │ ├── dataprep_scripts │ │ │ ├── __init__.py │ │ │ ├── anthropichh_dataprep │ │ │ │ └── download_and_process.py │ │ │ ├── custom_dataprep │ │ │ │ ├── __init__.py │ │ │ │ └── preprocess.py │ │ │ ├── dolly_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ └── preprocess.py │ │ │ ├── fid_evaluation_dataprep │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ └── preprocess.py │ │ │ ├── mc4_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── prepare.py │ │ │ │ ├── preprocess.py │ │ │ │ └── setup_preprocess.py │ │ │ ├── multimodal_dataprep │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ ├── download_images.py │ │ │ │ ├── download_parquet.py │ │ │ │ ├── generate_wdinfo.py │ │ │ │ ├── merge_source_tar.py │ │ │ │ ├── precache_encodings.py │ │ │ │ └── reorganize_tar.py │ │ │ ├── pile_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ └── preprocess.py │ │ │ └── slim_pajama_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── concat.sh │ │ │ │ ├── conf │ │ │ │ └── config.yaml │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ └── preprocess.py │ │ ├── eval_diffusion_fid_clip │ │ │ ├── TFinception_V3.py │ │ │ ├── compute_clip_score.py │ │ │ ├── compute_fid.py │ │ │ ├── eval_fid.py │ │ │ ├── fid_dataset.py │ │ │ └── plot.py │ │ ├── eval_harness │ │ │ ├── __init__.py │ │ │ ├── download.py │ │ │ ├── evaluate.py │ │ │ └── lm_eval │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── evaluator.py │ │ │ │ ├── metrics.py │ │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── dummy.py │ │ │ │ ├── nemo_baichuan2.py │ │ │ │ ├── nemo_chatglm.py │ │ │ │ ├── nemo_falcon.py │ │ │ │ ├── nemo_gpt3.py │ │ │ │ ├── nemo_gpt3_prompt.py │ 
│ │ │ ├── nemo_llama.py │ │ │ │ ├── nemo_llama_prompt.py │ │ │ │ ├── nemo_mistral.py │ │ │ │ ├── nemo_mixtral.py │ │ │ │ └── nemo_qwen2.py │ │ │ │ ├── tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── hellaswag.py │ │ │ │ ├── lambada.py │ │ │ │ ├── piqa.py │ │ │ │ ├── prompt.py │ │ │ │ ├── race.py │ │ │ │ ├── superglue.py │ │ │ │ ├── wikitext.py │ │ │ │ └── winogrande.py │ │ │ │ └── utils.py │ │ ├── export_scripts │ │ │ ├── __init__.py │ │ │ └── prepare_triton_model_config.py │ │ ├── gpu_affinity.py │ │ ├── hparams_override.py │ │ ├── metric_calculation │ │ │ ├── __init__.py │ │ │ ├── fine_tuning_metric_calc.py │ │ │ └── squad_metric_calc.py │ │ ├── numa_mapping.py │ │ ├── pause_and_prime_dns_connections.py │ │ └── run_dask_stage.sh │ ├── core │ │ ├── __init__.py │ │ ├── data_curation_stages.py │ │ ├── data_stages.py │ │ ├── export_stages.py │ │ ├── k8s_templates │ │ │ ├── conversion │ │ │ │ ├── Chart.yaml │ │ │ │ ├── conversion.yaml │ │ │ │ └── values.yaml │ │ │ ├── data_preparation │ │ │ │ ├── Chart.yaml │ │ │ │ ├── data-prep-config.yaml │ │ │ │ ├── data-prep.yaml │ │ │ │ └── values.yaml │ │ │ ├── evaluation │ │ │ │ ├── Chart.yaml │ │ │ │ ├── evaluation-config.yaml │ │ │ │ ├── evaluation.yaml │ │ │ │ └── values.yaml │ │ │ ├── peft │ │ │ │ ├── Chart.yaml │ │ │ │ ├── peft-config.yaml │ │ │ │ ├── peft.yaml │ │ │ │ └── values.yaml │ │ │ ├── rlhf_ppo │ │ │ │ ├── Chart.yaml │ │ │ │ ├── rlhf-ppo-actor.yaml │ │ │ │ ├── rlhf-ppo-config.yaml │ │ │ │ ├── rlhf-ppo-critic.yaml │ │ │ │ └── values.yaml │ │ │ └── training │ │ │ │ ├── Chart.yaml │ │ │ │ ├── training-config.yaml │ │ │ │ ├── training.yaml │ │ │ │ └── values.yaml │ │ ├── launchers.py │ │ ├── logger.py │ │ ├── rlhf_stages.py │ │ ├── stages.py │ │ └── v2 │ │ │ ├── __init__.py │ │ │ ├── config_k8s.py │ │ │ ├── config_k8s_test.py │ │ │ ├── stages.py │ │ │ ├── stages_test.py │ │ │ ├── step_k8s.py │ │ │ └── step_k8s_test.py │ └── utils │ │ ├── __init__.py │ │ ├── data_utils │ │ ├── __init__.py │ │ ├── download_squad.py │ │ ├── prepare_squad.py │ │ └── prompt_learning_squad_preprocessing.py │ │ ├── file_utils.py │ │ └── job_utils.py └── tests │ ├── __init__.py │ └── unit_tests │ ├── __init__.py │ ├── config_tests │ ├── test_cluster_config.py │ ├── test_fault_tol_config.py │ └── test_main_config.py │ ├── stages_tests │ ├── __init__.py │ ├── test_adapters.py │ ├── test_convert.py │ ├── test_data_prep.py │ ├── test_eval.py │ ├── test_export.py │ ├── test_fine_tune.py │ ├── test_ia3.py │ ├── test_prompt_learn.py │ ├── test_ptq.py │ └── test_train.py │ └── utils_tests │ ├── __init__.py │ └── test_file_utils.py ├── requirements.txt └── setup.cfg /.github/workflows/autoconf.yml: -------------------------------------------------------------------------------- 1 | name: autoconf 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | unit_tests: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: ./auto_configurator 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: | 20 | python -m pip install pytest 21 | pip install -r requirements.txt 22 | pip install requests-mock 23 | 24 | - name: run unit tests 25 | run: pytest 26 | working-directory: ${{env.working-directory}} 27 | -------------------------------------------------------------------------------- /.github/workflows/launcher.yml: -------------------------------------------------------------------------------- 1 | name: launcher 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 
7 | 8 | jobs: 9 | unit_tests: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: ./launcher_scripts 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: | 20 | python -m pip install pytest requests-mock -r requirements.txt 21 | 22 | - name: run unit tests 23 | run: PYTHONPATH=$PWD pytest 24 | working-directory: ${{env.working-directory}} 25 | -------------------------------------------------------------------------------- /.github/workflows/style.yml: -------------------------------------------------------------------------------- 1 | name: code_style 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | black: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: . 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: pip install --upgrade black==19.10b0 click==8.0.2 20 | 21 | - name: code style check 22 | run: black . --check --verbose --diff 23 | working-directory: ${{env.working-directory}} 24 | -------------------------------------------------------------------------------- /auto_configurator/autoconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_configurator/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | gpus_per_task: null 5 | gpus_per_node: 8 6 | mem: 0 7 | job_name_prefix: "nemo_megatron_autoconfig:" 8 | srun_args: 9 | - "--no-container-mount-home" 10 | -------------------------------------------------------------------------------- /auto_configurator/conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - cluster: bcm 4 | - search_config: gpt3/5b 5 | - override hydra/job_logging: stdout 6 | 7 | hydra: 8 | run: 9 | dir: . 10 | output_subdir: null 11 | 12 | run_training_hp_search: True 13 | run_inference_hp_search: True 14 | 15 | cluster_type: bcm # bcm or bcp 16 | auto_configurator_path: ??? # Path to the location of auto_configurator codebase. 17 | launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts 18 | base_results_dir: ${auto_configurator_path}/results 19 | data_dir: ${launcher_scripts_path}/data 20 | 21 | training_container: nvcr.io/nvidia/nemo:24.09 22 | container_mounts: 23 | - null 24 | 25 | wandb: # Weights and Biases (W&B) logging. 26 | enable: False # Whether to save logs to W&B. 27 | api_key_file: null # Path to the file where the w&B api key is stored. Key must be on the first line. 28 | project: nemo-megatron-autoconfig # Name of the W&B project to store the logs in. 
The name of the run will be populated automatically. 29 | 30 | # Do not modify the code below. 31 | search_config_value: ${hydra:runtime.choices.search_config} 32 | -------------------------------------------------------------------------------- /auto_configurator/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auto_configurator/tests/base_configs_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auto_configurator/tests/code_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/auto_configurator/tests/code_tests/__init__.py -------------------------------------------------------------------------------- /auto_configurator/tests/config_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/auto_configurator/tests/config_tests/__init__.py -------------------------------------------------------------------------------- /auto_configurator/tests/config_tests/test_cluster_config.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | 4 | class TestClusterConfig: 5 | def test_cluster_bcm_config(self): 6 | conf = OmegaConf.load("conf/cluster/bcm.yaml") 7 | s = """ 8 | partition: null 9 | account: null 10 | exclusive: True 11 | gpus_per_task: null 12 | gpus_per_node: 8 13 | mem: 0 14 | job_name_prefix: "nemo_megatron_autoconfig:" 15 | srun_args: 16 | - "--no-container-mount-home" 17 | """ 18 | expected = OmegaConf.create(s) 19 | assert ( 20 | expected == conf 21 | ), f"conf/cluster/bcm.yaml must be set to {expected} but it currently is {conf}." 22 | -------------------------------------------------------------------------------- /auto_configurator/tuning/README.md: -------------------------------------------------------------------------------- 1 | # NeMo Framework Launcher Fine-Tuning Autoconfigurator 2 | 3 | The fine-tuning autoconfigurator runs a hyperparameter search for fine-tuning jobs on a Slurm cluster. As in a standard grid search, it launches a sequence of jobs with different hyperparameter configurations and analyzes their results to find the best-performing one in terms of validation loss. 4 | 5 | ## Usage 6 | 1. Specify the Slurm cluster parameters in `conf/cluster/bcm.yaml` 7 | 2. Fill in all required values in `conf/config.yaml`. The `search_config.param_grid` field defines the set of hyperparameter values to sweep (see the illustrative sketch below): hyperparameter names are given in Hydra dot notation, and each value is a list of candidate values to choose from. 8 | 3. Run the hyperparameter search with `python3 main.py`
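
For illustration only, a grid of the following shape could be specified; the dotted hyperparameter paths below are hypothetical examples, not values taken from this repository's configs:

```yaml
# Hypothetical sketch of a grid-search specification (the layout follows the
# search_config.param_grid convention described above; the dotted paths are examples).
search_config:
  param_grid:
    peft.model.optim.lr: [1.0e-4, 5.0e-5, 1.0e-5]         # learning-rate candidates
    peft.model.peft.lora_tuning.adapter_dim: [8, 16, 32]  # LoRA adapter-dimension candidates
    peft.model.global_batch_size: [64, 128]               # global-batch-size candidates
```

A full grid over these lists would launch 3 × 3 × 2 = 18 fine-tuning jobs, one per combination, with the best run selected by validation loss as described above.
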
9 | 10 | The following results will be stored in `base_results_dir` for each hyperparameter search: 11 | - `candidate_configs` - .yaml config files used for the different experiments 12 | - `ft_logs` - logs of NeMo fine-tuning jobs 13 | - `final_result` - a folder containing the result analysis logs and the experiment summary in `results.csv` -------------------------------------------------------------------------------- /auto_configurator/tuning/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | gpus_per_task: null 5 | gpus_per_node: null 6 | mem: 0 7 | job_name_prefix: ":" 8 | srun_args: 9 | - "--no-container-mount-home" 10 | -------------------------------------------------------------------------------- /auto_configurator/tuning/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Entry point, main file to run to launch fine-tuning autoconfigurator jobs.""" 16 | 17 | import hydra 18 | import omegaconf 19 | from src.search import run_search 20 | 21 | 22 | @hydra.main(config_path="conf", config_name="config") 23 | def main(cfg: omegaconf.dictconfig.DictConfig) -> None: 24 | """ 25 | Entry point for the fine-tuning autoconfigurator pipeline. Reads the config using 26 | hydra and runs the fine-tuning hyperparameter search. 27 | :param omegaconf.dictconfig.DictConfig cfg: OmegaConf object, read using 28 | the @hydra.main decorator. 29 | :return: None 30 | """ 31 | run_search(cfg=cfg) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /csp_tools/aws/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
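# Note (assumptions, not part of the original script): this is written as a Slurm batch
# script, so it is typically submitted with `sbatch` so the #SBATCH directives below take
# effect. The `srun --container-image`/`--container-mounts` options assume the pyxis/enroot
# container plugin is available on the cluster, and the relative image path
# ../../nemo_megatron_training.sqsh assumes the job is launched from within csp_tools/aws.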
15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | srun --container-mounts="$PWD:/nccl" \ 20 | --container-image=../../nemo_megatron_training.sqsh \ 21 | bash -c " 22 | cd /nccl && 23 | curl -fSsL --proto '=https' https://github.com/NVIDIA/nccl-tests/tarball/master | tar xz && 24 | mv NVIDIA-nccl-tests* nccl-tests && 25 | cd nccl-tests && 26 | make -j CUDA_HOME=/usr/local/cuda MPI=1 MPI_HOME=/opt/amazon/openmpi/" 27 | -------------------------------------------------------------------------------- /csp_tools/aws/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --gpus-per-node=8 18 | #SBATCH --time=1:00:00 19 | 20 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 21 | # This script should not be called manually. It should only be called by cluster_validation.sh 22 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 23 | -------------------------------------------------------------------------------- /csp_tools/azure/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | HPCX_PATH="/opt/hpcx-v2.9.0-gcc-MLNX_OFED_LINUX-5.4-1.0.3.0-ubuntu18.04-x86_64" 20 | 21 | export OMPI_MCA_pml=ucx 22 | export OMPI_MCA_btl=^openib 23 | 24 | srun --container-mounts="$PWD:/nccl,$HPCX_PATH:/opt/hpcx" \ 25 | --container-image="nvcr.io/nvidia/pytorch:21.09-py3" \ 26 | --container-name="nccl" \ 27 | bash -c " 28 | cd /nccl && 29 | git clone https://github.com/NVIDIA/nccl-tests.git && 30 | source /opt/hpcx/hpcx-init.sh && 31 | hpcx_load && 32 | cd nccl-tests && 33 | make MPI=1" 34 | -------------------------------------------------------------------------------- /csp_tools/azure/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --time=1:00:00 18 | 19 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 20 | # This script should not be called manually. It should only be called by cluster_validation.sh 21 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 22 | -------------------------------------------------------------------------------- /csp_tools/oci/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | HPCX_PATH="/opt/hpcx-v2.11-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.11-x86_64" 20 | 21 | export OMPI_MCA_pml=ucx 22 | export OMPI_MCA_btl=^openib 23 | 24 | srun --container-mounts="$PWD:/nccl,$HPCX_PATH:/opt/hpcx" \ 25 | --container-image="nvcr.io/nvidia/pytorch:21.09-py3" \ 26 | --container-name="nccl" \ 27 | bash -c " 28 | cd /nccl && 29 | git clone https://github.com/NVIDIA/nccl-tests.git && 30 | source /opt/hpcx/hpcx-init.sh && 31 | hpcx_load && 32 | cd nccl-tests && 33 | make MPI=1" 34 | -------------------------------------------------------------------------------- /csp_tools/oci/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --time=1:00:00 18 | 19 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 20 | # This script should not be called manually. 
It should only be called by cluster_validation.sh 21 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 22 | -------------------------------------------------------------------------------- /examples/peft/llama/a100/lora_4gpu_k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | set -eu 5 | 6 | #Users should specify the following directories 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=$(readlink -f ${SCRIPT_DIR}/../../../..) 8 | DATA_DIR=${DATA_DIR} 9 | RESTORE_FROM_PATH=${RESTORE_FROM_PATH} 10 | RUN_NAME=${RUN_NAME:-llama-7b-peft-lora} 11 | PEFT_CONFIG=${PEFT_CONFIG:-llama/squad} 12 | 13 | # peft.model.megatron_amp_O2=false is needed on containers earlier than 23.11 that 14 | # do not include https://github.com/NVIDIA/NeMo/pull/7971 15 | TRANSIENT_OVERRIDES="peft.model.megatron_amp_O2=false" 16 | 17 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 18 | cluster=k8s_v2 \ 19 | cluster_type=k8s \ 20 | cluster.ib_interfaces=null \ 21 | container=nvcr.io/nvidia/nemo:24.09 \ 22 | stages=[peft] \ 23 | peft=${PEFT_CONFIG} \ 24 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 25 | data_dir=${DATA_DIR} \ 26 | peft.run.name="${RUN_NAME}" \ 27 | peft.trainer.num_nodes=1 \ 28 | peft.trainer.devices=4 \ 29 | peft.trainer.max_epochs=null \ 30 | peft.trainer.max_steps=2000 \ 31 | peft.model.global_batch_size=128 \ 32 | peft.model.micro_batch_size=1 \ 33 | peft.model.restore_from_path=$RESTORE_FROM_PATH \ 34 | $TRANSIENT_OVERRIDES \ 35 | $@ 36 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/175b_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_a100_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/20b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
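# For example (the paths below are placeholders, not part of this repository):
#   NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher \
#   DATA_DIR=/path/to/preprocessed/pile \
#   bash examples/training/gpt/a100/20b_1node.sh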
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_a100_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/20b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_a100_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training.run.time_limit=0:20:00 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/40b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
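# Batch-size arithmetic for this recipe (assuming 8 GPUs per node): data-parallel size =
# (8 nodes * 8 GPUs) / (tensor_parallel 2 * pipeline_parallel 4) = 8, so global_batch_size
# 256 = micro_batch_size 2 * data-parallel 8 * 16 gradient-accumulation steps.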
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_a100_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/5b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_a100_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/5b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_a100_8node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training.run.time_limit=0:20:00 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/fsdp_20b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example does pre-training GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_a100_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/fsdp_5b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example does pre-training GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 
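# In the Hydra overrides below, a leading "+" adds a key that is not in the base config
# (e.g. +training.model.fsdp=True), while a leading "~" deletes one (e.g. the optimizer
# overlap/bucket options that this FSDP example removes).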
7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_a100_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/175b_bf16_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_h100_bf16_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/175b_fp8_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
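# Relative to the bf16 recipe above, this run additionally sets training.model.fp8=true and
# exports NVTE_FUSED_ATTN=1 (selecting Transformer Engine's fused attention backend); the
# parallelism layout (TP=4, PP=8, virtual PP=12 across 16 nodes) is unchanged.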
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_h100_fp8_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.model.fp8=true \ 23 | training.run.time_limit=0:20:00 \ 24 | +training.model.optim.grad_sync_dtype=bf16 \ 25 | +env_vars.NVTE_FUSED_ATTN=1 \ 26 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_bf16_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_fp8_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_fp8_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
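# FP8 counterpart of the 8-node 20B run: identical parallelism (TP=2, PP=1) and
# global batch size 2048, with training.model.fp8=true added.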
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/40b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/40b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
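# FP8 variant of the 8-node 40B recipe: TP=2, PP=4, virtual PP=12, global batch
# size 256, with training.model.fp8=true added on top of the BF16 settings.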
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.model.fp8=true \ 23 | training.run.time_limit=0:20:00 \ 24 | +training.model.optim.grad_sync_dtype=bf16 \ 25 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_bf16_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
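# 8-node BF16 run of the 5B model: data parallelism only (TP=1, PP=1), with the
# global batch size raised from 256 to 2048 relative to the single-node recipe.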
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_fp8_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_fp8_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
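# FP8 counterpart of the 8-node 5B run: same data-parallel-only layout (TP=1, PP=1)
# and global batch size 2048, with training.model.fp8=true added.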
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/fsdp_20b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example pre-trains a GPT 20B model using torch FSDP + TP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/20b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_20b_h100_bf16_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=32 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=False \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/fsdp_5b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example pre-trains a GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 
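# The FSDP examples use "bf16-mixed" trainer precision with megatron_amp_O2 disabled,
# switch the optimizer to fused_adam, and remove the options that configure the Megatron
# distributed optimizer (bucket_cap_mb, overlap_grad_sync, overlap_param_sync,
# contiguous_grad_buffer), which are not used when torch FSDP handles sharding.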
7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_h100_bf16_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=grok/grok1_proxy \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="grok1_proxy_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | training.model.moe_grouped_gemm=False \ 19 | training.model.gradient_accumulation_fusion=True \ 20 | +training.model.optim.grad_sync_dtype=bf16 \ 21 | training.trainer.num_nodes=64 \ 22 | +training.model.context_parallel_size=2 \ 23 | training.model.sequence_parallel=True \ 24 | training.model.tensor_model_parallel_size=4 \ 25 | training.model.pipeline_model_parallel_size=8 \ 26 | training.model.virtual_pipeline_model_parallel_size=8 \ 27 | training.model.gc_interval=40 28 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_13b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | 
training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_7b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_13b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | 
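# Example invocation (the paths below are placeholders; substitute your own).
# NEMO_FRAMEWORK_LAUNCHER_DIR defaults to /opt/NeMo-Framework-Launcher if unset:
#   DATA_DIR=/path/to/preprocessed_data TOK_PATH=/path/to/llama_tokenizer.model bash llama2_13b_bf16.sh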
-------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_13b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_70b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_7b_bf16.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_7b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_fp8" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_405b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_405b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_1_405b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=72 \ 17 | training.model.global_batch_size=252 \ 18 | training.model.tokenizer.model=${TOK_PATH} \ 19 | +training.model.optim.grad_sync_dtype=bf16 \ 20 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_405b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | 
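# Llama 3.1 405B FP8 recipe: 72 nodes, global batch size 252, FP8 hybrid enabled,
# plus a tensor-parallel communication overlap config whose name indicates tuning for
# TP=8, CP=2 and sequence length 8192 (ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192).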
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_405b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_1_405b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=72 \ 17 | training.model.global_batch_size=252 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_70b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_70b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_8b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 
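# Llama 3 8B BF16 recipe: single node, global batch size 128, FP8 disabled
# (training.model.fp8=False, training.model.fp8_hybrid=False).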
4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_8b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_8b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_8b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_8b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_8b_fp8" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x3b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x3b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x3b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x3b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x3b \ 10 | stages=[training] \ 11 | 
data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x3b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | training.model.fp8=True \ 18 | +training.model.fp8_params=True \ 19 | +training.model.optim.overlap_param_gather_with_optimizer_step=False \ 20 | +training.model.optim.average_in_collective=True \ 21 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x7b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x7b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x7b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x7b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | training.model.fp8=True \ 19 | +training.model.fp8_params=True \ 20 | +training.model.optim.overlap_param_gather_with_optimizer_step=True \ 21 | +training.model.optim.average_in_collective=True \ 22 | training.model.sequence_parallel=False \ 23 | -------------------------------------------------------------------------------- /examples/training/nemotron/a100/nemotron_22b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_bf16" \ 14 | training.run.time_limit=0:20:00 \ 15 | 
training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/a100/nemotron_8b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_bf16" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_22b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_bf16" \ 14 | training.run.time_limit=0:20:00 \ 15 | training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_22b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_fp8" \ 14 | training.run.time_limit=0:20:00 \ 15 | training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | +training.model.gc_interval=100 \ 21 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_8b_bf16.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_bf16" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_8b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_fp8" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | +training.model.gc_interval=100 \ 21 | -------------------------------------------------------------------------------- /img/4B_bert_throughput_2211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/4B_bert_throughput_2211.png -------------------------------------------------------------------------------- /img/4b_bert_loss_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/4b_bert_loss_final.png -------------------------------------------------------------------------------- /img/model_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/model_overview.png -------------------------------------------------------------------------------- /launcher_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | 
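# The fields below are assumed to map to options of the Slurm job (sbatch/srun)
# that the launcher generates for the bcm cluster type.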
gpus_per_task: null 5 | gpus_per_node: 8 6 | mem: 0 7 | job_name_prefix: 'nemo-megatron-' 8 | nodelist: null 9 | srun_args: 10 | - "--no-container-mount-home" 11 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/baichuan2/convert_baichuan2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: baichuan2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_baichuan2.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_baichuan2-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/baichuan2/baichuan2_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/chatglm/convert_chatglm.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_chatglm.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: gpt # gpt or t5, use t5 for mt5 as well 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_chatglm-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/clip/convert_clip.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: clip_vit_B_32 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_clip.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: megatron_clip 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | vocab_file: ${data_dir}/bpe/vocab.txt 22 | merge_file: null 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/controlnet/convert_controlnet.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: controlnet_v1-5 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo-controlnet.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: controlnet 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/dreambooth/convert_dreambooth.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: dreambooth_sd_860m 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_dreambooth.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: dreambooth 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: gpt3_5b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_gpt.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | vocab_file: ${data_dir}/bpe/vocab.json 23 | merge_file: ${data_dir}/bpe/merges.txt 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/imagen/convert_imagen.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: imagen_base64_500m_edm 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_imagen.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: imagen 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest-EMA # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/instruct_pix2pix/convert_instruct_pix2pix.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: instruct_pix2pix_860m_sd_edit 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_instruct_pix2pix.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: instruct_pix2pix 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/instruct-pix2pix--val 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/llama/convert_llama.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mistral/convert_mistral.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mistral_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mistral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | sequence_parallel: True 21 | pipeline_model_parallel_size: 1 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mixtral/convert_mixtral.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mixtral 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mixtral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_mixtral-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mixtral/convert_mixtral_8x22b.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mixtral 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mixtral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_mixtral-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/nemotron/convert_nemotron.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: nemotron 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_nemotron.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_nemotron-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/neva/convert_neva.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: neva_llama2_7b_chat 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_neva.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: neva 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 4 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_qwen2.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_qwen2-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/stable_diffusion/convert_stable_diffusion.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: stable_diffusion_860m_res_256_pretrain 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_stable_diffusion.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: stable_diffusion 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest-EMA # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/starcoder2/convert_starcoder2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: starcoder2_15b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_starcoder2.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: gpt # gpt or t5, use t5 for mt5 as well 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 2 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | tokenizer_model: ${data_dir}/starcoder2/starcoder2_tokenizer.model 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/t5/convert_t5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: t5_220m 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_t5.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: t5 # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 # 1 for 220m, 2 for 3b 20 | pipeline_model_parallel_size: 1 21 | pipeline_model_parallel_split_rank: ${divide_floor:${.pipeline_model_parallel_size}, 2} 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | vocab_file: ${data_dir}/bpe/vocab.txt 24 | merge_file: null 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/vit/convert_vit.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: vit_B_16 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_vit_classification.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: vit_classification 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/compute_minhashes/compute_minhashes.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'compute-minhashes' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | minhash_length: 260 15 | char_ngram: 5 16 | hash_bytes: 4 17 | seed: 42 18 | num_files: -1 19 | files_per_partition: 10 20 | 21 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/connected_component/connected_component.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'connected-component' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | jaccard_pairs_path: ${data_curation.run.results_dir}/fuzzy_deduped/dedup_final_results.parquet 15 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 16 | cache_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_cache 17 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/fasttext_download/fasttext_download.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'fasttext-download' 3 | results_dir: 
${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "00:20:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | filter_config: 10 | input_field: text 11 | filters: 12 | - name: nemo_curator.filters.classifier_filter.FastTextLangId 13 | log_score: True 14 | params: 15 | model_path: lid.176.bin # Will be automatically downloaded if it doesn't exist -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/find_matching_ngrams/find_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'find-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 2 7 | node_type: cpu 8 | 9 | output_matched_ngram_data: ${.run.results_dir}/matched_ngrams.pkl -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_compute/jaccard_compute.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-compute' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | shuffled_docs_path: ${data_curation.run.results_dir}/fuzzy_deduped/shuffled_docs.parquet 15 | files_per_partition: 5 16 | num_files: -1 17 | 18 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_map_buckets/jaccard_map_buckets.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-map-buckets' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_bucket_dir: ${data_curation.run.results_dir}/fuzzy_deduped/buckets.parquet 15 | num_files: -1 16 | text_ddf_blocksize: 512 17 | 18 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_shuffle/jaccard_shuffle.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-shuffle' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_bucket_mapping_dir: ${data_curation.run.results_dir}/fuzzy_deduped/anchor_docs_with_bk.parquet 15 | num_files: -1 16 | text_ddf_blocksize: 512 17 | parts_per_worker: 2 18 | 19 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/language_identification/language_identification.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'language-identification' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | 
dependency: "singleton" 5 | time_limit: "04:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | log_scores: store_true 10 | output_retained_document_dir: ${.run.results_dir}/lang_annotated -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/minhash_buckets/minhash_buckets.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'minhash-buckets' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_minhash_dir: ${data_curation.run.results_dir}/fuzzy_deduped/dedup_test_rapids/minhashes.parquet 15 | minhash_length: 260 16 | num_bands: 20 17 | buckets_per_shuffle: 10 18 | 19 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/remove_matching_ngrams/remove_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'remove-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 2 7 | node_type: cpu 8 | 9 | output_task_deduped_dir: ${data_dir}/task_deduped -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/separate_by_language/separate_by_language.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'separate-by-language' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "01:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_data_dir: ${.run.results_dir}/lang_separated 10 | output_language_distribution: ${.run.results_dir}/lang_distro.json -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/text_cleaning/text_cleaning.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'text-cleaning' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "04:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_clean_dir: ${.run.results_dir}/clean 10 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/verify_all_pairs_jaccard/verify_all_pairs_jaccard.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'verify-all-pairs-jaccard' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 2GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 15 | cache_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_cache 16 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/write_deduped_result_with_text/write_deduped_result_with_text.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 
'write-deduped-result-with-text' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 1 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 1GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 15 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/curate_sft.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'data-curation' 3 | results_dir: ${base_results_dir}/${.name} 4 | 5 | # Many steps in the data curator do not use GPUs 6 | # Adjust configs here if you would like to use different cluster configurations for jobs that do/don't require GPUs 7 | cpu_config: 8 | partition: 9 | 10 | gpu_config: 11 | partition: 12 | 13 | stages: 14 | - task_deduplication 15 | 16 | task_deduplication: 17 | - prepare_task_data 18 | - find_matching_ngrams 19 | - remove_matching_ngrams 20 | 21 | dataset_name: sft 22 | 23 | defaults: 24 | - sft/prepare_task_data/prepare_task_data 25 | - sft/find_matching_ngrams/find_matching_ngrams 26 | - sft/remove_matching_ngrams/remove_matching_ngrams 27 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/find_matching_ngrams/find_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'find-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_matched_ngram_data: ${.run.results_dir}/matched_ngrams.pkl 10 | input_json_text_field: text -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/remove_matching_ngrams/remove_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'remove-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_task_deduped_dir: ${data_dir}/task_deduped 10 | input_json_text_field: text 11 | max_document_splits: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/baichuan2/download_baichuan2_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_baichuan2_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
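
The tokenizer block that follows (download_tokenizer_url through tokenizer_model) places a SentencePiece model under ${data_dir}/baichuan2. A quick, standalone way to sanity-check such a .model file once it has been downloaded, assuming the sentencepiece package is installed; the snippet and the literal path are illustrative, not part of the launcher:

import sentencepiece as spm

# Load the downloaded tokenizer.model (path mirrors the tokenizer_model setting below)
# and run a tiny round trip to confirm the file is usable.
sp = spm.SentencePieceProcessor(model_file="data/baichuan2/baichuan2_tokenizer.model")
print("vocab size:", sp.get_piece_size())
print(sp.encode("quick tokenizer check", out_type=str))
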
15 | download_tokenizer_url: "https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/baichuan2 18 | tokenizer_model: ${.tokenizer_save_dir}/baichuan2_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/bert/download_bert_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_bert_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_vocab_url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt" # URL to download the vocab from. 16 | download_merges_url: null 17 | vocab_save_dir: ${data_dir} 18 | merges_save_dir: ${data_dir} 19 | tokenizer_type: BertWordPieceLowerCase # Bert model uses BertWordPieceLowerCase tokenizer 20 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 21 | rm_extracted: True # Preprocess script will remove extracted files after preproc. -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/chatglm/download_chatglm_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_chatglm_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_tokenizer_url: "https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/chatglm 18 | tokenizer_model: ${.tokenizer_save_dir}/chatglm_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 
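
The Pile download configs above and below all follow the same pattern: the_pile_url plus a file_numbers range such as "0-29" select which of the 30 Pile shards to fetch, and node_array_size lets a Slurm job array pull one shard per task. A minimal sketch of how such a range string can be expanded into shard URLs, assuming the shards keep the Pile's zero-padded NN.jsonl.zst naming; the helper is illustrative, not the launcher's own code:

def pile_shard_urls(base_url: str, file_numbers: str) -> list[str]:
    """Expand a simple range string like '0-29' into per-shard download URLs."""
    start, end = (int(part) for part in file_numbers.split("-"))
    return [f"{base_url}{index:02d}.jsonl.zst" for index in range(start, end + 1)]

urls = pile_shard_urls(
    "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/",
    "0-29",
)
print(len(urls), "shards, first:", urls[0])
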
21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/code_llama/download_human_eval.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_human_eval 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | array: ${..file_numbers} 7 | 8 | human_eval_url: "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" # Source URL to download The human_eval data. 9 | split_string: "0.7,0.2,0.1" #The ratio to split into train/test/validation 10 | output_dir: ${data_dir}/human_eval #Output to write train.jsonl /test.jsonl /validation.jsonl file 11 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/falcon/download_falcon_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_falcon_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: tiiuae/falcon-7b 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/fid_evaluation/download_coco2014.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_coco2014 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "0:30:00" 5 | dependency: "singleton" 6 | 7 | dataset_output_root: ${data_dir}/fid_evaluation/coco2014 8 | 9 | preprocess_images: True 10 | preprocess_captions: True 11 | num_processes: 8 # set to number of CPUs in the job (-1 defaults to slurm cpus_per_task) -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_gpt3_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
15 | download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. 16 | download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. 17 | vocab_save_dir: ${data_dir}/bpe 18 | merges_save_dir: ${data_dir}/bpe 19 | tokenizer_type: GPT2BPETokenizer 20 | tokenizer_library: megatron 21 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 22 | rm_extracted: True # Preprocess script will remove extracted files after preproc. 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_llama_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/llama 18 | tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_nemo_123b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-Large-Instruct-2407 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 
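
The GPT BPE assets referenced above (download_vocab_url and download_merges_url) land in ${data_dir}/bpe, which is where the conversion configs earlier in this tree expect vocab_file and merge_file. A standard-library sketch of fetching the same two files into that layout; the local directory name is illustrative:

import pathlib
import urllib.request

bpe_dir = pathlib.Path("data/bpe")  # stands in for ${data_dir}/bpe
bpe_dir.mkdir(parents=True, exist_ok=True)
assets = {
    "vocab.json": "https://huggingface.co/gpt2/resolve/main/vocab.json",
    "merges.txt": "https://huggingface.co/gpt2/resolve/main/merges.txt",
}
for filename, url in assets.items():
    # Same URLs as download_vocab_url / download_merges_url in download_gpt3_pile.yaml.
    urllib.request.urlretrieve(url, str(bpe_dir / filename))
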
19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_nemo_12b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-Nemo-Base-2407 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-7B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mixtral/download_mixtral_8x22b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mixtral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
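
The Mistral and Mixtral download configs, including the lines that follow, set tokenizer_library: "huggingface" and put a Hugging Face Hub model id in tokenizer_type rather than downloading a SentencePiece file. A sketch of resolving such an id, assuming the transformers package, network access, and whatever license acceptance the Hub repository requires; illustrative only:

from transformers import AutoTokenizer

# tokenizer_type in these configs is a Hub id, e.g. mistralai/Mistral-7B-v0.1 above.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
print(tokenizer("a short tokenization check")["input_ids"][:8])
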
15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistral-community/Mixtral-8x22B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mixtral/download_mixtral_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mixtral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mixtral-8x7B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/multimodal/precache_t5xxl.yaml: -------------------------------------------------------------------------------- 1 | batch_size_per_GPU: 64 # as much as it can fit in your GPU memory 2 | dataloader_num_workers: 16 3 | save_original_in_tar: #[video] 4 | encodings: # see README for instructions 5 | - modality: text 6 | extension: text 7 | key: t5xxl 8 | precision: 16 9 | store_pad_tokens: False 10 | encoder_config: 11 | cls: encoders.t5encoder.T5Encoder 12 | max_seq_len: 64 # see webvid caption length distribution (mostly less than 40 words) 13 | encoder_path: /path/to/encoders # contains t5xxl-encoder.bin 14 | # - modality: video 15 | # extension: mp4 16 | 17 | lightning: 18 | devices: 8 19 | num_nodes: 1 20 | max_epochs: 1 # important for caching 21 | precision: 16 22 | accelerator: gpu 23 | enable_checkpointing: False 24 | strategy: ddp -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/nemotron/download_nemotron_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_nemotron_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
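
The preprocessing that preprocess_data enables consumes the extracted .jsonl shards, which hold one JSON object per line with a "text" field (the same field the data-curation configs reference as input_json_text_field). A standard-library sketch of iterating one shard; the filename is illustrative:

import json

with open("data/00.jsonl", encoding="utf-8") as shard:
    for line in shard:
        document = json.loads(line)
        text = document["text"]  # the field consumed by preprocessing and curation steps
        print(len(text.split()), "words in first document")
        break
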
15 | download_tokenizer_url: null 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/nemotron 18 | tokenizer_model: ${.tokenizer_save_dir}/nemotron_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/steerlm/steerlm_data_prep1.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: steerlm_dataset_prep1 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 1 7 | bcp_preproc_npernode: 1 # 2 should be safe to use and x2 times faster. 8 | 9 | prep_stage : "1" # make sure wrap in string type 10 | dataset: helpsteer # either openassistant or helpsteer 11 | output_dir: "${data_dir}/steerlm/" # specify output_directory of the downloaded and preprocessed data 12 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 13 | 14 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/steerlm/steerlm_data_prep2_reg.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: steerlm_dataset_prep2_reg 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 1 7 | bcp_preproc_npernode: 1 # 2 should be safe to use and x2 times faster. 8 | 9 | prep_stage: "2" # make sure wrap in string type 10 | input_dataset: "${data_dir}/steerlm/merge_train.jsonl" # for merged train or val jsonl data, see https://github.com/NVIDIA/NeMo-Aligner/blob/main/docs/user-guide/SteerLM.rst#step-2-download-and-preprocess-data-for-attribute-prediction-modelling 11 | output_dir: "${data_dir}/steerlm/merged_train_reg.jsonl" # specify output_directory of the downloaded and preprocessed data 12 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 13 | 14 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/t5/download_t5_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_t5_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_vocab_url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt" # URL to download the vocab from. 
16 | download_merges_url: null 17 | vocab_save_dir: ${data_dir}/bpe 18 | merges_save_dir: ${data_dir}/bpe 19 | tokenizer_type: BertWordPieceCase # T5 model uses BertWordPieceCase tokenizer 20 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 21 | rm_extracted: True # Preprocess script will remove extracted files after preproc. -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/chatglm/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-chatglm 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/chatglm/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-chatglm 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/clip/imagenet_zeroshot.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: eval_${.task_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: clip_vit_B_32 6 | task_name: "imagenet_zeroshot" # Rename this name to be more clear 7 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 8 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 9 | 10 | trainer: 11 | devices: 8 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: bf16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${base_results_dir}/${evaluation.run.model_train_name}/results/checkpoints/nemo_clip.nemo # Path to a trained CLIP .nemo file 19 | precision: ${evaluation.trainer.precision} 20 | micro_batch_size: 1000 21 | global_batch_size: 8000 22 | 23 | data: 24 | num_workers: 8 25 | imagenet_val: ${data_dir}/imagenet_1k/val # path to imagenet val folder -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/falcon/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: falcon_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-falcon 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/llama/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-llama 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-llama 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mistral/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mistral_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mistral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mixtral/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mixtral_8x7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mixtral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 8 20 | pipeline_model_parallel_size: 1 21 | sequence_parallel: True 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | precision: bf16 # must match training precision - 32, 16 or bf16 24 | eval_batch_size: 4 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mixtral/evaluate_all_8x22b.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mixtral_8x22b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mixtral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 8 20 | pipeline_model_parallel_size: 1 21 | sequence_parallel: True 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | precision: bf16 # must match training precision - 32, 16 or bf16 24 | eval_batch_size: 4 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/nemotron/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: nemotron 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-nemotron 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g.
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/prompt_gpt3/squad.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "4:00:00" 4 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 5 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 6 | eval_name: eval_prompt_squad 7 | model_train_name: gpt3_5b 8 | tasks: "prompt" # general prompt task 9 | prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task 10 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 11 | 12 | model: 13 | model_type: nemo-gpt3-prompt 14 | nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_gpt_prompt.nemo 15 | tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b 16 | pipeline_model_parallel_size: 1 17 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 18 | precision: bf16 # must match training precision - 32, 16 or bf16 19 | eval_batch_size: 4 20 | prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl 21 | disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/prompt_llama/squad.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "1:00:00" 4 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 5 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 6 | eval_name: eval_prompt_squad 7 | model_train_name: llama_7b 8 | tasks: "prompt" # general prompt task 9 | prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task 10 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 11 | 12 | model: 13 | model_type: nemo-llama-prompt 14 | nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_llama_prompt.nemo 15 | tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b 16 | pipeline_model_parallel_size: 1 17 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 18 | precision: bf16 # must match training precision - 32, 16 or bf16 19 | eval_batch_size: 4 20 | prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl 21 | disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 
22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-qwen2 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-qwen2 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g.
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/vit/imagenet_val.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: eval_${.task_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: vit_B_16 6 | task_name: "imagenet_val" # Rename this name to be more clear 7 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 8 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 9 | 10 | trainer: 11 | devices: 1 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: bf16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${evaluation.run.fine_tuning_dir}/results/checkpoints/nemo_vit_classification.nemo # Path to a trained vit .nemo file 19 | precision: ${evaluation.trainer.precision} 20 | micro_batch_size: 512 # we only support DP=1 eval at the moment, GBS=MBS 21 | 22 | data: 23 | num_workers: 8 24 | imagenet_val: ${data_dir}/imagenet_1k/val # path to imagenet val folder -------------------------------------------------------------------------------- /launcher_scripts/conf/export/gpt3/export_gpt3.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: "gpt3_5b" 5 | dependency: "singleton" 6 | training_dir: ${base_results_dir}/${.model_train_name} 7 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 8 | results_dir: ${base_results_dir}/${.model_train_name}/export_${.config_summary} 9 | model_type: "gpt3" 10 | 11 | model: 12 | checkpoint_path: ${export.run.training_dir}/results/checkpoints 13 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 14 | tensor_model_parallel_size: 8 15 | weight_data_type: fp16 # fp32|fp16 16 | processes: 16 17 | load_checkpoints_to_cpu: False 18 | 19 | triton_deployment: 20 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 21 | max_batch_size: 1 22 | pipeline_model_parallel_size: 1 23 | int8_mode: False 24 | enable_custom_all_reduce: False 25 | data_type: fp16 # fp32|fp16|bf16 26 | 27 | benchmark: 28 | input_len: 60 29 | output_len: 20 30 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 31 | triton_wait_time_s: 300 32 | vocab_size: 51200 33 | -------------------------------------------------------------------------------- /launcher_scripts/conf/export/mt5/export_mt5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: t5_invalid_model_name # Add here model name. It must match export configuration.
5 | dependency: "singleton" 6 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 7 | results_dir: ${base_results_dir}/${.model_train_name}/export_from_convert_${.config_summary} 8 | model_type: "mt5" 9 | 10 | model: 11 | checkpoint_path: t5_invalid_path # Set here path of model converted from training 12 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 13 | tensor_model_parallel_size: 8 14 | weight_data_type: fp16 # fp32|fp16 15 | processes: 16 16 | load_checkpoints_to_cpu: False 17 | 18 | triton_deployment: 19 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 20 | max_batch_size: 1 21 | pipeline_model_parallel_size: 1 22 | int8_mode: False 23 | enable_custom_all_reduce: False 24 | data_type: fp16 # fp32|fp16|bf16 25 | 26 | benchmark: 27 | input_len: 60 28 | output_len: 20 29 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 30 | triton_wait_time_s: 300 31 | vocab_size: 250112 32 | -------------------------------------------------------------------------------- /launcher_scripts/conf/export/t5/export_t5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: mt5_invalid_model_name # Add here model name. It must match export configuration. 5 | dependency: "singleton" 6 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 7 | results_dir: ${base_results_dir}/${.model_train_name}/export_from_convert_${.config_summary} 8 | model_type: "t5" 9 | 10 | model: 11 | checkpoint_path: mt5_invalid_path # Set here path of model converted from training 12 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 13 | tensor_model_parallel_size: 8 14 | weight_data_type: fp16 # fp32|fp16 15 | processes: 16 16 | load_checkpoints_to_cpu: False 17 | 18 | triton_deployment: 19 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 20 | max_batch_size: 1 21 | pipeline_model_parallel_size: 1 22 | int8_mode: False 23 | enable_custom_all_reduce: False 24 | data_type: fp16 # fp32|fp16|bf16 25 | 26 | benchmark: 27 | input_len: 60 28 | output_len: 20 29 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 30 | triton_wait_time_s: 300 31 | vocab_size: 29184 32 | -------------------------------------------------------------------------------- /launcher_scripts/conf/external_conversion/clip/convert_external_clip.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_external_clip 3 | nodes: ${divide_ceil:${external_conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${external_conversion.model.model_parallel_size}, ${.nodes}} 7 | results_dir: ${base_results_dir}/${.name} 8 | nemo_file_name: converted_${external_conversion.model.arch}_${external_conversion.model.version}.nemo # name of nemo checkpoint; must be .nemo file 9 | 10 | model: 11 | # If converting from OpenCLIP, specify the architecture (`arch`) and version (`version`) from the 12 | # OpenCLIP model list (https://github.com/mlfoundations/open_clip#usage). 
13 | # If converting from Hugging Face, set the version to `huggingface` and the architecture (`arch`) 14 | # to the Hugging Face model name (e.g., `laion/CLIP-ViT-H-14-laion2B-s32B-b79K`). 15 | arch: ViT-H-14 16 | version: laion2b_s32b_b79k 17 | hparams_file: /path/to/modified_hparam.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/clip/clip_similarity.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: clip_vit_B_32 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | image_path: ??? # Path to an image for inference 9 | texts: ??? # List of texts to compute similarity 10 | 11 | trainer: 12 | devices: 1 13 | num_nodes: 1 14 | accelerator: gpu 15 | logger: False # logger provided by exp_manager 16 | precision: bf16 # 16, 32, or bf16 17 | 18 | model: 19 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_clip.nemo # Path to a trained CLIP .nemo file 20 | precision: ${fw_inference.trainer.precision} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/dreambooth/text2img.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: dreambooth_sd_860m 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | infer: 9 | unconditional_guidance_scale: 7.5 10 | num_images_per_prompt: 4 11 | batch_size: 4 12 | height: 512 13 | width: 512 14 | down_factor: 8 15 | inference_steps: 100 16 | sampler_type: 'DDIM' 17 | eta: 0 18 | output_type: 'pil' 19 | save_to_file: True 20 | out_path: ${fw_inference.run.results_dir} 21 | seed: 234 22 | prompts: 23 | - "a photo of a sks dog" 24 | - "a photo of a sks dog in the Acropolis" 25 | - "a photo of a sks dog in front of eiffel tower" 26 | - "a photo of sks dog sleeping" 27 | - "a photo of a sks dog riding a bike" 28 | 29 | trainer: 30 | devices: 1 31 | num_nodes: 1 32 | accelerator: gpu 33 | precision: 16 34 | logger: False # logger provided by exp_manager 35 | 36 | model: 37 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/convert_nemo/results/nemo_dreambooth.nemo # Path to a trained DreamBooth .nemo file 38 | precision: ${fw_inference.trainer.precision} 39 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/instruct_pix2pix/edit_cli.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: instruct_pix2pix_860m_sd_edit 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | edit: 9 | resolution: 512 10 | steps: 100 11 | input: ???
# path/to/input/picture 12 | outpath: ${fw_inference.run.results_dir} 13 | prompt: "" 14 | cfg_text: 7.5 15 | cfg_image: 1.2 16 | num_images_per_prompt: 8 17 | combine_images: [2, 4] # [row, column]; set to null if you don't want to combine 18 | seed: 1234 19 | 20 | trainer: 21 | devices: 1 22 | num_nodes: 1 23 | accelerator: gpu 24 | logger: False # logger provided by exp_manager 25 | precision: bf16 # 16, 32, or bf16 26 | 27 | model: 28 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_instruct_pix2pix.nemo # Path to a trained InstructPix2Pix .nemo file 29 | precision: ${fw_inference.trainer.precision} 30 | 31 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/nsfw/nsfw.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "00:10:00" 4 | dependency: "singleton" 5 | model_train_name: nsfw_L_14 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | image_path: ??? # Path to an image for inference 9 | 10 | trainer: 11 | devices: 1 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: 16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_nsfw.nemo 19 | precision: ${fw_inference.trainer.precision} 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/vit/imagenet1k.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: vit_B_16 6 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 7 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 8 | 9 | data_path: ???
# Path to a image folder for inference 10 | 11 | trainer: 12 | devices: 1 13 | num_nodes: 1 14 | accelerator: gpu 15 | logger: False # logger provided by exp_manager 16 | precision: bf16 # 16, 32, or bf16 17 | 18 | model: 19 | restore_from_path: ${fw_inference.run.fine_tuning_dir}/results/checkpoints/nemo_vit_classification.nemo # Path to a trained vit .nemo file 20 | precision: ${fw_inference.trainer.precision} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/rag_indexing/bert/110m.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | searchpath: 3 | - file:///opt/NeMo/examples/nlp/language_modeling/conf 4 | 5 | run: 6 | name: ${.eval_name}_${.model_train_name} 7 | time_limit: "4:00:00" 8 | dependency: "singleton" 9 | nodes: 1 10 | ntasks_per_node: 1 11 | eval_name: rag_indexing 12 | model_train_name: bert 13 | results_dir: ${base_results_dir}/${.name} 14 | 15 | trainer: 16 | devices: 1 17 | num_nodes: 1 18 | accelerator: gpu 19 | logger: False # logger provided by exp_manager 20 | precision: 'bf16-mixed' 21 | use_distributed_sampler: False 22 | 23 | indexing: 24 | embedder: 25 | model_type: bert 26 | model_path: null 27 | embed_batch_size: 128 28 | data: 29 | data_path: null 30 | chunk_size: 256 31 | chunk_overlap: 10 32 | index_path: null -------------------------------------------------------------------------------- /launcher_scripts/conf/rag_indexing/bert/340m.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | searchpath: 3 | - file:///opt/NeMo/examples/nlp/language_modeling/conf 4 | 5 | run: 6 | name: ${.eval_name}_${.model_train_name} 7 | time_limit: "4:00:00" 8 | dependency: "singleton" 9 | nodes: 1 10 | ntasks_per_node: 1 11 | eval_name: rag_indexing 12 | model_train_name: bert 13 | results_dir: ${base_results_dir}/${.name} 14 | 15 | trainer: 16 | devices: 1 17 | num_nodes: 1 18 | accelerator: gpu 19 | logger: False # logger provided by exp_manager 20 | precision: 'bf16-mixed' 21 | use_distributed_sampler: False 22 | 23 | indexing: 24 | embedder: 25 | model_type: bert 26 | model_path: null 27 | embed_batch_size: 128 28 | data: 29 | data_path: null 30 | chunk_size: 256 31 | chunk_overlap: 10 32 | index_path: null -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/random.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.random_background.RandomBackground 2 | base_background: [1, 1, 1] 3 | random_ratio: 0.5 4 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/static.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground 2 | background: [0, 0, 1] # rgb 3 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/tcnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.tcnn_background.TCNNBackground 2 | bound: 1 3 | encoder_num_input_dims: 3 # 3 directions 4 | encoder_cfg: 5 | otype: "HashGrid" 6 | n_levels: 16 7 | n_features_per_level: 2 8 | 
log2_hashmap_size: 19 9 | base_resolution: 16 10 | interpolation: "Smoothstep" 11 | per_level_scale: # default is np.exp2(np.log2(2048 * bound / 16) / (16 - 1)) 12 | 13 | background_net_num_output_dims: 3 # rgb 14 | background_net_cfg: 15 | otype: "FullyFusedMLP" 16 | activation: "ReLU" 17 | output_activation: "None" 18 | n_neurons: 32 19 | n_hidden_layers: 2 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/torchngp.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.torchngp_background.TorchNGPBackground 2 | 3 | encoder_type: "frequency" 4 | encoder_input_dims: 3 5 | encoder_multi_res: 6 6 | 7 | num_output_dims: 3 8 | net_cfg: 9 | num_hidden_dims: 32 10 | num_layers: 2 11 | bias: True 12 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/data/data.yaml: -------------------------------------------------------------------------------- 1 | _target_: data.AggregatorDataModule 2 | 3 | train_batch_size: 1 4 | train_shuffle: false 5 | train_dataset: 6 | _target_: nemo.collections.multimodal.data.nerf.random_poses.RandomPosesDataset 7 | internal_batch_size: 100 8 | width: 64 9 | height: 64 10 | radius_range: [3.0, 3.5] 11 | theta_range: [45, 105] 12 | phi_range: [-180, 180] 13 | fovx_range: [10, 30] 14 | fovy_range: [10, 30] 15 | jitter: False 16 | jitter_center: 0.2 17 | jitter_target: 0.2 18 | jitter_up: 0.02 19 | uniform_sphere_rate: 0 20 | angle_overhead: 30 21 | angle_front: 60 22 | 23 | val_batch_size: 1 24 | val_shuffle: false 25 | val_dataset: 26 | _target_: nemo.collections.multimodal.data.nerf.circle_poses.CirclePosesDataset 27 | size: 5 28 | width: 800 29 | height: 800 30 | angle_overhead: 30 31 | angle_front: 60 32 | 33 | test_batch_size: 1 34 | test_shuffle: false 35 | test_dataset: 36 | _target_: nemo.collections.multimodal.data.nerf.circle_poses.CirclePosesDataset 37 | size: 100 38 | width: 800 39 | height: 800 40 | angle_overhead: 30 41 | angle_front: 60 42 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/dreamfusion-dmtet.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion # TODO(ahmadki): dreamfusion-dmetet should have it's own class 2 | defaults: 3 | - nerf: torchngp 4 | - background: torchngp 5 | - material: basic_shading 6 | - renderer: nvdiffrast 7 | - guidance: sd_huggingface 8 | - optim: adan 9 | - loss: dmtet 10 | - data: data 11 | - _self_ 12 | 13 | ### model options 14 | resume_from_checkpoint: 15 | prompt: 'a hamburger' 16 | negative_prompt: '' 17 | front_prompt: ', front view' 18 | side_prompt: ', side view' 19 | back_prompt: ', back view' 20 | update_extra_interval: 16 21 | guidance_scale: 100 22 | export_video: False 23 | 24 | iters: ${training.trainer.max_steps} 25 | # TODO(ahmadki): move to database 26 | latent_iter_ratio: 0.0 27 | albedo_iter_ratio: 0 28 | min_ambient_ratio: 0.1 29 | textureless_ratio: 0.2 30 | 31 | data: 32 | train_dataset: 33 | width: 512 34 | height: 512 35 | val_dataset: 36 | width: 800 37 | height: 800 38 | test_dataset: 39 | width: 800 40 | height: 800 41 | -------------------------------------------------------------------------------- 
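The NeRF model configs in this section are standard Hydra object configs: `_target_` names a Python class and the remaining keys become its constructor arguments, while the `defaults` list in the DreamFusion model configs swaps the nerf/background/renderer/guidance sub-configs in and out before instantiation. A minimal sketch of how such a config is typically turned into an object with `hydra.utils.instantiate` follows; the inline config mirrors model/background/static.yaml above, and it assumes the NeMo multimodal collection is installed (only the class path is copied from the repo, the rest is illustrative).

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Inline stand-in for a config like model/background/static.yaml.
# `_target_` is resolved to a class; the remaining keys are passed as kwargs.
cfg = OmegaConf.create({
    "_target_": "nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground",
    "background": [0, 0, 1],  # rgb
})

# Imports StaticBackground and calls StaticBackground(background=[0, 0, 1]).
background = instantiate(cfg)

Instantiation is recursive, so nested sub-configs that carry their own `_target_` (as in data.yaml's train/val/test datasets) are built the same way.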
/launcher_scripts/conf/training/nerf/model/dreamfusion.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion 2 | defaults: 3 | - nerf: torchngp 4 | - background: static 5 | - material: basic_shading 6 | - renderer: torchngp_raymarching 7 | - guidance: sd_nemo 8 | - optim: adan 9 | - loss: dreamfusion 10 | - data: data 11 | - _self_ 12 | 13 | ### model options 14 | resume_from_checkpoint: 15 | prompt: 'a hamburger' 16 | negative_prompt: '' 17 | front_prompt: ', front view' 18 | side_prompt: ', side view' 19 | back_prompt: ', back view' 20 | update_extra_interval: 16 21 | guidance_scale: 100 22 | export_video: False 23 | 24 | iters: ${training.trainer.max_steps} 25 | # TODO(ahmadki): move to database 26 | latent_iter_ratio: 0.2 27 | albedo_iter_ratio: 0.0 28 | min_ambient_ratio: 0.1 29 | textureless_ratio: 0.2 30 | 31 | data: 32 | train_dataset: 33 | width: 64 34 | height: 64 35 | val_dataset: 36 | width: 800 37 | height: 800 38 | test_dataset: 39 | width: 800 40 | height: 800 41 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_huggingface.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_huggingface_pipeline.StableDiffusion 2 | precision: ${training.trainer.precision} 3 | model_key: stabilityai/stable-diffusion-2-1-base 4 | t_range: [0.02, 0.98] 5 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_nemo.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_nemo_pipeline.StableDiffusion 2 | checkpoint: /sd_checkpoints/nemo-1.5/sd-1.5.nemo 3 | sampler_type: 'DDIM' 4 | t_range: [0.02, 0.98] 5 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_trt.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_trt_pipeline.StableDiffusion 2 | checkpoint: /sd_checkpoints/nemo-1.5/sd-1.5.nemo 3 | plan_dir: /sd_checkpoints/nemo-1.5/plan 4 | sampler_type: 'DDIM' 5 | t_range: [0.02, 0.98] 6 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/loss/dmtet.yaml: -------------------------------------------------------------------------------- 1 | lambda_sds: 1.0 2 | lambda_opacity: 0.0 3 | lambda_entropy: 0.0 4 | lambda_orientation: 0.0 5 | lambda_2d_normal_smooth: 0.0 6 | lambda_3d_normal_smooth: 0.0 7 | lambda_mesh_normal: 0.5 8 | lambda_mesh_laplacian: 0.5 9 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/loss/dreamfusion.yaml: -------------------------------------------------------------------------------- 1 | lambda_sds: 1.0 2 | lambda_opacity: 0.0 3 | lambda_entropy: 1e-3 4 | lambda_orientation: 1e-2 5 | lambda_2d_normal_smooth: 0.0 6 | lambda_3d_normal_smooth: 0.0 7 | lambda_mesh_normal: 0.0 8 | lambda_mesh_laplacian: 0.0 9 | --------------------------------------------------------------------------------
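The two loss configs above (loss/dmtet.yaml and loss/dreamfusion.yaml) are plain dictionaries of per-term weights: each `lambda_*` scales one component of the training objective, and a zero weight effectively disables that term (the DMTet variant adds mesh normal and Laplacian regularizers, while the DreamFusion variant relies on entropy and orientation terms instead). The sketch below shows one common way such weights are consumed; the term names, helper function, and dummy values are illustrative, not the repo's actual loss implementation.

import torch
from omegaconf import OmegaConf

# Weights copied from loss/dreamfusion.yaml above.
loss_cfg = OmegaConf.create({
    "lambda_sds": 1.0,
    "lambda_opacity": 0.0,
    "lambda_entropy": 1e-3,
    "lambda_orientation": 1e-2,
    "lambda_2d_normal_smooth": 0.0,
    "lambda_3d_normal_smooth": 0.0,
    "lambda_mesh_normal": 0.0,
    "lambda_mesh_laplacian": 0.0,
})

def total_loss(terms: dict, cfg) -> torch.Tensor:
    """Weighted sum of loss terms; terms whose lambda is zero are skipped."""
    total = torch.zeros(())
    for name, value in terms.items():
        weight = float(cfg.get(f"lambda_{name}", 0.0))
        if weight != 0.0:
            total = total + weight * value
    return total

# Example with dummy scalar losses:
terms = {"sds": torch.tensor(2.0), "entropy": torch.tensor(0.5), "opacity": torch.tensor(3.0)}
print(total_loss(terms, loss_cfg))  # 1.0 * 2.0 + 1e-3 * 0.5; opacity is weighted by 0.0 and dropped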
/launcher_scripts/conf/training/nerf/model/material/basic_shading.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.materials.basic_shading.BasicShading 2 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/nerf/tcnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.geometry.tcnn_nerf.TCNNNerf 2 | num_input_dims: 3 # 3D space 3 | bound: 1 4 | density_activation: softplus # softplus, exp 5 | blob_radius: 0.5 6 | blob_density: 10 7 | normal_type: central_finite_difference 8 | 9 | encoder_cfg: 10 | otype: "HashGrid" 11 | n_levels: 16 12 | n_features_per_level: 2 13 | log2_hashmap_size: 19 14 | base_resolution: 16 15 | interpolation: "Smoothstep" 16 | per_level_scale: # default is np.exp2(np.log2(2048 * bound / 16) / (16 - 1)) 17 | 18 | sigma_net_num_output_dims: 1 # density 19 | sigma_net_cfg: 20 | otype: "FullyFusedMLP" 21 | activation: "ReLU" 22 | output_activation: "None" 23 | n_neurons: 64 24 | n_hidden_layers: 3 25 | 26 | features_net_num_output_dims: 3 # rgb 27 | features_net_cfg: 28 | otype: "FullyFusedMLP" 29 | activation: "ReLU" 30 | output_activation: "None" 31 | n_neurons: 64 32 | n_hidden_layers: 3 33 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/nerf/torchngp.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.geometry.torchngp_nerf.TorchNGPNerf 2 | num_input_dims: 3 # 3D space 3 | bound: 1 4 | density_activation: exp # softplus, exp 5 | blob_radius: 0.2 6 | blob_density: 5 7 | normal_type: central_finite_difference 8 | 9 | encoder_cfg: 10 | encoder_type: 'hashgrid' 11 | encoder_max_level: 12 | log2_hashmap_size: 19 13 | desired_resolution: 2048 14 | interpolation: smoothstep 15 | 16 | sigma_net_num_output_dims: 1 # density 17 | sigma_net_cfg: 18 | num_hidden_dims: 64 19 | num_layers: 3 20 | bias: True # FIXME(ahmadki):exp: does it makes sense that it's True ? 
21 | 22 | features_net_num_output_dims: 3 # rgb 23 | features_net_cfg: 24 | num_hidden_dims: 64 25 | num_layers: 3 26 | bias: True 27 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/optim/adan.yaml: -------------------------------------------------------------------------------- 1 | name: adan 2 | lr: 5e-3 3 | eps: 1e-8 4 | weight_decay: 2e-5 5 | max_grad_norm: 5.0 6 | foreach: False 7 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/nerfacc.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.nerfacc_volume_renderer.NerfaccVolumeBaseRenderer 2 | grid_resolution: 128 3 | grid_levels: 3 4 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 5 | render_step_size: 1.e-3 6 | near_plane: 0.2 7 | cone_angle: 0.004 8 | alpha_thre: 1.e-2 9 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/nvdiffrast.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.nvdiffrast_renderer.NVDiffRastRenderer 2 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 3 | grid_resolution: 128 4 | density_thresh: 10.0 5 | update_interval: 16 6 | quartet_file: "/results/tets/128_tets.npz" # FIXME(ahmadki): documentation 7 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/torchngp_raymarching.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.torchngp_volume_renderer.TorchNGPVolumeRenderer 2 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 3 | update_interval: 16 4 | grid_resolution: 128 5 | density_thresh: 10 6 | max_steps: 1024 7 | dt_gamma: 0 8 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | set_sm_margin: 0 9 | 10 | qkv_wgrad: 11 | method: bulk 12 | num_sm: 2 13 | set_sm_margin: 0 14 | 15 | fc1_dgrad: 16 | method: bulk 17 | num_sm: 2 18 | set_sm_margin: 0 19 | 20 | fc1_wgrad: 21 | method: bulk 22 | num_sm: 2 23 | set_sm_margin: 0 24 | 25 | ## Ring-exchange overlap with AllGather 26 | qkv_fprop: 27 | method: ring_exchange 28 | aggregate: 0 29 | 30 | proj_dgrad: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | fc1_fprop: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc2_dgrad: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | # Chunked-collective overlap with ReduceScatter 43 | proj_fprop: 44 | method: pipeline 45 | num_sm: 4 46 | num_splits: 4 47 | set_sm_margin: 0 48 | 49 | fc2_fprop: 50 | method: pipeline 51 | num_sm: 4 52 | num_splits: 4 53 | set_sm_margin: 0 54 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml: 
-------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | set_sm_margin: 0 9 | 10 | qkv_wgrad: 11 | method: bulk 12 | num_sm: 2 13 | set_sm_margin: 0 14 | 15 | fc1_dgrad: 16 | method: bulk 17 | num_sm: 2 18 | set_sm_margin: 0 19 | 20 | fc1_wgrad: 21 | method: bulk 22 | num_sm: 2 23 | set_sm_margin: 0 24 | 25 | ## Ring-exchange overlap with AllGather 26 | qkv_fprop: 27 | method: ring_exchange 28 | aggregate: 0 29 | 30 | proj_dgrad: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | fc1_fprop: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc2_dgrad: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | # Chunked-collective overlap with ReduceScatter 43 | proj_fprop: 44 | method: pipeline 45 | num_sm: 8 46 | num_splits: 4 47 | set_sm_margin: 0 48 | 49 | fc2_fprop: 50 | method: pipeline 51 | num_sm: 4 52 | num_splits: 4 53 | set_sm_margin: 0 54 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 2 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 1 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 1 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 1 45 | 46 | # Chunked 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 0 54 | atomic_gemm: 0 55 | 56 | fc2_fprop: 57 | method: pipeline 58 | num_sm: 8 59 | cga_size: 2 60 | num_splits: 4 61 | set_sm_margin: 1 62 | fp8_buf: 0 63 | atomic_gemm: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 2 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 1 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 1 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked 47 | proj_fprop: 48 | method: pipeline 49 |
num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | atomic_gemm: 0 55 | 56 | fc2_fprop: 57 | method: pipeline 58 | num_sm: 16 59 | cga_size: 2 60 | num_splits: 4 61 | set_sm_margin: 1 62 | fp8_buf: 1 63 | atomic_gemm: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: pipeline 57 | num_sm: 16 58 | cga_size: 2 59 | num_splits: 4 60 | set_sm_margin: 1 61 | fp8_buf: 1 62 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 4 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: ring_exchange 57 | num_sm: 1 58 | set_sm_margin: 1 59 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP4/MBS2/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 4 14 | cga_size: 2 15 |
set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: ring_exchange 57 | num_sm: 1 58 | set_sm_margin: 1 59 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 16 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 4 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 16 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 16 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 24 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h5120_tp2_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/13B/TP2/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 8 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 20 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | 
-------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp2_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP2/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 32 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 8 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | proj_fprop: 47 | method: ring_exchange 48 | num_sm: 1 49 | set_sm_margin: 1 50 | 51 | fc2_fprop: 52 | method: ring_exchange 53 | num_sm: 1 54 | set_sm_margin: 1 55 | 56 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp4_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 16 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 1 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 16 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp4_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 |
fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 16 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/data/nsfw/concepts.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/auto_blend.yaml: -------------------------------------------------------------------------------- 1 | model_type: mt5 2 | preprocessed_dir: null 3 | blending_alpha: 1.0 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/checkpoint_search.yaml: -------------------------------------------------------------------------------- 1 | checkpoint_folder: null 2 | checkpoint_name: latest 3 | tensor_model_parallel_size: 1 4 | pipeline_model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/get_ag_overlap.yaml: -------------------------------------------------------------------------------- 1 | name: 'get_ag_overlap' 2 | fp8: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/get_ln_sm_margin.yaml: -------------------------------------------------------------------------------- 1 | name: 'get_ln_sm_margin' -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/hparams_override.yaml: -------------------------------------------------------------------------------- 1 | hparams_file: null 2 | output_path: null 3 | 4 | vocab_file: null 5 | merge_file: null 6 | tokenizer_model: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/numa_mapping.yaml: -------------------------------------------------------------------------------- 1 | # GPU NUMA Mapping Config 2 | enable: True # Set to False to disable all mapping (performance will suffer). 3 | mode: unique_contiguous # One of: all, single, single_unique, unique_interleaved or unique_contiguous. 4 | scope: node # Either node or socket. 5 | cores: all_logical # Either all_logical or single_logical. 6 | balanced: True # Whether to assign an equal number of physical cores to each process. 7 | min_cores: 1 # Minimum number of physical cores per process. 8 | max_cores: 8 # Maximum number of physical cores per process. Can be null to use all available cores. -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/datacuration_scripts/download_fasttext.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # 19 | # Downloads the FastText classifier 20 | # 21 | 22 | 23 | set -eu 24 | 25 | res_file=$1 26 | 27 | ## Download the fasttext model 28 | if [ !
-f ${res_file} ]; then 29 | wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O ${res_file} 30 | fi -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/custom_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/fid_evaluation_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | preprocess_images: True 2 | preprocess_captions: True 3 | root_dir: /path/to/fid_evaluation/coco2014/ 4 | num_processes: 8 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/mc4_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/multimodal_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # this file is only a template for the hydra arguments 2 | # you should not edit or use this file unless you are debugging data prep scripts directly 3 | 4 | dataset_repo_id: 5 | dataset_output_root: 6 | 7 | input_dir: 8 | output_dir: 9 | parquet_subpartitions: 10 | parquet_pattern: 11 | num_parquets_downloaded: 12 | download_num_processes: 13 | download_num_threads: 14 | img2dataset_additional_arguments: 15 | node_array_size: 16 | tar_chunk_size: 17 | file_ext_in_tar: 18 | precache_config_path: 19 | output_wdinfo_path: 20 | append_tar_dir: 21 | source_dir: 22 | source_extensions: 23 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: download_gpt3_pile 2 | cluster_type: bcm 3 | launcher_scripts_path: null 4 | data_dir: null 5 | the_pile_url: null 6 | file_numbers: null 7 | rm_downloaded: True 8 | rm_extracted: True 9 | tokenizer_type: null 10 | vocab_save_dir: null 11 | merges_save_dir: null 12 | tokenizer_library: null 13 | tokenizer_model: null 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/slim_pajama_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/slim_pajama_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/hydra_logging@_group_: none 4 | - override /hydra/job_logging@_group_: none 5 | hydra: 6 | run: 7 | dir: . 8 | output_subdir: null 9 | data_config: download_slim_pajama 10 | cluster_type: bcm 11 | launcher_scripts_path: null 12 | data_dir: null 13 | slim_pajama_url: null 14 | approved_sources: null 15 | file_numbers: null 16 | rm_downloaded: True 17 | rm_extracted: True 18 | tokenizer_type: null 19 | vocab_save_dir: null 20 | merges_save_dir: null 21 | tokenizer_library: null 22 | tokenizer_model: null 23 | preprocessed_dir: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/eval_harness/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/export_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/collections/export_scripts/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/metric_calculation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/collections/metric_calculation/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Base Model Conversion 4 | name: nemo-framework-conversion 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Data Preparation 4 | name: nemo-framework-data-prep 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: data-prep-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | nodes: training.trainer.num_nodes 9 | 10 | dataPrepConfig: 11 | # Specify the amount of shared memory to attach to the Pods # 12 | shmSize: 512Gi 13 | 14 | # Insert the address for the NFS server if using NFS for 
model storage # 15 | NFSServer: 16 | 17 | # Insert the path to save data on the NFS server # 18 | NFSPath: 19 | 20 | # Insert the total number of processes to spawn on the cluster # 21 | totalProcesses: 22 | 23 | # Insert the number of processes to spawn per node # 24 | procsPerNode: 25 | 26 | # Insert the data preparation stage, such as download, extract, or preprocess # 27 | stage: 28 | 29 | # Insert the dnsPolicy # 30 | dnsPolicy: "nil" 31 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Evaluation 4 | name: nemo-framework-evaluation 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: evaluation-config 5 | data: 6 | hparams.yaml: |- 7 | {{ (.Files.Glob "config/hparams.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/peft/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework PEFT 4 | name: nemo-framework-peft 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/peft/peft-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-peft-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework RLHF PPO training 4 | name: nemo-framework-rlhf-ppo 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/rlhf-ppo-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-config 5 | data: 6 | config_critic.yaml: |- 7 | {{ (.Files.Glob "config/gpt_ppo_critic.yaml").AsConfig | indent 4 }} 8 | config_actor.yaml: |- 9 | {{ (.Files.Glob "config/gpt_ppo_actor.yaml").AsConfig | indent 4 }} 10 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | trainingConfig: 9 | # Specify the amount of shared memory to attach to the Pods # 10 | shmSize: 512Gi 11 | 12 | # Insert the address for the NFS server if using NFS for 
model storage # 13 | NFSServer: 14 | 15 | # Insert the path to save data on the NFS server # 16 | NFSPath: 17 | 18 | # Specify the k8s resource name for IB devices # 19 | ibResourceName: nvidia.com/hostdev 20 | 21 | # Specify the number of IB devices to include in pods # 22 | ibCount: "0" 23 | 24 | # Specify the WandB API key if using WandB for logging # 25 | wandbKey: "nil" 26 | 27 | # Insert the dnsPolicy # 28 | dnsPolicy: "nil" 29 | 30 | critic: 31 | numGPUs: rlhf_ppo.critic.trainer.devices 32 | nodes: rlhf_ppo.critic.trainer.num_nodes 33 | 34 | actor: 35 | numGPUs: rlhf_ppo.actor.trainer.devices 36 | nodes: rlhf_ppo.actor.trainer.num_nodes 37 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Base Model Training 4 | name: nemo-framework-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: training-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | numGPUs: training.trainer.devices 9 | nodes: training.trainer.num_nodes 10 | 11 | trainingConfig: 12 | # Specify the amount of shared memory to attach to the Pods # 13 | shmSize: 512Gi 14 | 15 | # Insert the address for the NFS server if using NFS for model storage # 16 | NFSServer: 17 | 18 | # Insert the path to save data on the NFS server # 19 | NFSPath: 20 | 21 | # Specify the k8s resource name for IB devices. Can be string or list of strings. If list, must be same length as ibCount # 22 | ibResourceName: nvidia.com/hostdev 23 | 24 | # Specify the number of IB devices to include in pods. Can be string or list. If list, must be same length as ibResourceName # 25 | ibCount: "8" 26 | 27 | # Specify the number of IB networks to include in pods. Should be a comma separated set of networks. # 28 | ibNetworkAnnotation: "" 29 | 30 | # Specify the WandB API key if using WandB for logging # 31 | wandbKey: "nil" 32 | 33 | # Insert the dnsPolicy # 34 | dnsPolicy: "nil" 35 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/v2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/core/v2/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/utils/data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/config_tests/test_cluster_config.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | 4 | class TestClusterConfig: 5 | def test_cluster_bcm_config(self): 6 | conf = OmegaConf.load("conf/cluster/bcm.yaml") 7 | s = """ 8 | partition: null 9 | account: null 10 | exclusive: True 11 | gpus_per_task: null 12 | gpus_per_node: 8 13 | mem: 0 14 | job_name_prefix: "nemo-megatron-" 15 | nodelist: null 16 | srun_args: 17 | - "--no-container-mount-home" 18 | """ 19 | expected = OmegaConf.create(s) 20 | assert ( 21 | expected == conf 22 | ), f"conf/cluster/bcm.yaml must be set to {expected} but it currently is {conf}." 
23 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/stages_tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_adapters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def adapter_learning(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[adapter_learning] " 11 | f"adapter_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestAdapterLearn: 25 | def test_gpt3(self): 26 | 27 | output = adapter_learning("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = adapter_learning("t5") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def convert(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[conversion] " 11 | f"conversion={model_type}/convert_{model_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestConvert: 25 | def test_gpt3(self): 26 | 27 | output = convert("gpt3") 28 | assert ERROR in output 29 | 30 | def test_prompt_t5(self): 31 | 32 | output = convert("t5") 33 | assert ERROR in output 34 | 35 | def test_prompt_mt5(self): 36 | 37 | output = convert("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_data_prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def data_prep(model_type, data_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[data_preparation] " 11 | f"data_preparation={model_type}/{data_type} " 12 | "launcher_scripts_path=. 
" 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestDataPrep: 25 | def test_gpt3(self): 26 | 27 | output = data_prep("gpt3", "download_gpt3_pile") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = data_prep("t5", "download_t5_pile") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = data_prep("mt5", "download_mc4") 38 | assert ERROR in output 39 | 40 | def test_bert(self): 41 | 42 | output = data_prep("bert", "download_bert_pile") 43 | assert ERROR in output 44 | 45 | def test_remove_folders(self): 46 | os.system("rm -rf test_folder") 47 | os.system("rm -rf data") 48 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def export(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[export] " 11 | f"export={model_type}/export_{model_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestExport: 25 | def test_gpt3(self): 26 | 27 | output = export("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = export("t5") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = export("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_fine_tune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def fine_tune(model_type, task_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[fine_tuning] " 11 | f"fine_tuning={model_type}/{task_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestFineTune: 25 | def test_t5(self): 26 | 27 | output = fine_tune("t5", "squad") 28 | assert ERROR in output 29 | 30 | def test_mt5(self): 31 | 32 | output = fine_tune("mt5", "xquad") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_ia3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 
5 | 6 | 7 | def ia3_learning(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[ia3_learning] " 11 | f"ia3_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestIA3Learn: 25 | def test_gpt3(self): 26 | 27 | output = ia3_learning("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = ia3_learning("t5") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_prompt_learn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def prompt_learn(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[prompt_learning] " 11 | f"prompt_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestPromptLearn: 25 | def test_gpt3(self): 26 | 27 | output = prompt_learn("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = prompt_learn("t5") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = prompt_learn("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | os.system("rm -rf data") 43 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/utils_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/utils_tests/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | best_download>=0.0.6 2 | dask 3 | huggingface_hub>=0.13.0 4 | hydra-core==1.3.2 5 | img2dataset 6 | omegaconf>=2.2,<2.3 7 | pynvml==11.4.1 8 | pytablewriter==0.58.0 9 | requests==2.26.0 10 | tqdm==4.62.3 11 | zstandard==0.15.2 12 | hera 13 | pydantic 14 | kubeflow-training>=1.8 15 | kubernetes 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 119 3 | profile = black 4 | --------------------------------------------------------------------------------
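Note: the YAML files collected above are plain OmegaConf/Hydra configs that the launcher resolves at run time. As an illustrative sketch only (not a file from the repository), the snippet below shows how one of them — the NUMA-mapping defaults — could be loaded and overridden programmatically with the omegaconf package pinned in requirements.txt; the override values are hypothetical, and the working directory is assumed to be launcher_scripts/.

# Minimal sketch; assumes it is run from launcher_scripts/ and that the
# overrides below are examples, not recommended settings.
from omegaconf import OmegaConf

# Load the shipped NUMA-mapping defaults.
base = OmegaConf.load("nemo_launcher/collections/conf/numa_mapping.yaml")

# Apply CLI-style (dotlist) overrides, mirroring how Hydra overrides look.
overrides = OmegaConf.from_dotlist(["mode=all", "max_cores=null"])
cfg = OmegaConf.merge(base, overrides)

print(OmegaConf.to_yaml(cfg))  # inspect the resolved mapping settings
assert cfg.enable              # the shipped default keeps NUMA mapping enabled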