├── .github └── workflows │ ├── autoconf.yml │ ├── launcher.yml │ └── style.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── auto_configurator ├── autoconfig │ ├── __init__.py │ ├── base_config.py │ ├── inference_summary.py │ ├── inference_sweep.py │ ├── scripts │ │ └── compare_throughput.py │ ├── search_config.py │ ├── train.py │ ├── training_config.py │ └── utils.py ├── base_configs │ ├── baichuan2_13b.yaml │ ├── baichuan2_7b.yaml │ ├── bert.yaml │ ├── chatglm_6b.yaml │ ├── gpt3.yaml │ ├── llama2_13b.yaml │ ├── llama2_70b.yaml │ ├── llama2_7b.yaml │ ├── llama3_70b.yaml │ ├── llama3_8b.yaml │ ├── mixtral_3b.yaml │ ├── mixtral_7b.yaml │ ├── mt5.yaml │ ├── qwen2_14b.yaml │ ├── qwen2_4b.yaml │ ├── qwen2_72b.yaml │ ├── qwen2_7b.yaml │ └── t5.yaml ├── conf │ ├── cluster │ │ └── bcm.yaml │ ├── config.yaml │ └── search_config │ │ ├── baichuan2 │ │ ├── 13b.yaml │ │ └── 7b.yaml │ │ ├── bert │ │ ├── 0.11b.yaml │ │ ├── 100b.yaml │ │ ├── 20b.yaml │ │ ├── 4b.yaml │ │ └── unknown_size.yaml │ │ ├── chatglm │ │ └── 6b.yaml │ │ ├── gpt3 │ │ ├── 0.126b.yaml │ │ ├── 0.843b.yaml │ │ ├── 175b.yaml │ │ ├── 20b.yaml │ │ ├── 2b.yaml │ │ ├── 43b.yaml │ │ ├── 5b.yaml │ │ ├── 8b.yaml │ │ └── unknown_size.yaml │ │ ├── llama │ │ ├── llama2_13b.yaml │ │ ├── llama2_70b.yaml │ │ ├── llama2_7b.yaml │ │ ├── llama2_7b_nemo.yaml │ │ ├── llama3_70b.yaml │ │ └── llama3_8b.yaml │ │ ├── mixtral │ │ ├── 3b.yaml │ │ └── 7b.yaml │ │ ├── mt5 │ │ ├── 0.17b.yaml │ │ ├── 0.39b.yaml │ │ ├── 101.6b.yaml │ │ ├── 11.9b.yaml │ │ ├── 206b.yaml │ │ ├── 24.65b.yaml │ │ ├── 3.2b.yaml │ │ ├── 42.54b.yaml │ │ └── unknown_size.yaml │ │ ├── qwen2 │ │ ├── 14b.yaml │ │ ├── 4b.yaml │ │ ├── 72b.yaml │ │ └── 7b.yaml │ │ └── t5 │ │ ├── 0.22b.yaml │ │ ├── 100b.yaml │ │ ├── 11b.yaml │ │ ├── 2.8b.yaml │ │ ├── 203b.yaml │ │ ├── 23.5b.yaml │ │ ├── 41.2b.yaml │ │ └── unknown_size.yaml ├── main.py ├── tests │ ├── __init__.py │ ├── base_configs_tests │ │ ├── __init__.py │ │ └── test_base_configs.py │ ├── code_tests │ │ ├── __init__.py │ │ ├── test_base_config.py │ │ ├── test_training_config.py │ │ └── test_utils.py │ └── config_tests │ │ ├── __init__.py │ │ ├── test_bert_config.py │ │ ├── test_cluster_config.py │ │ ├── test_gpt3_config.py │ │ ├── test_llama_config.py │ │ ├── test_main_config.py │ │ ├── test_mixtral_config.py │ │ ├── test_mt5_config.py │ │ └── test_t5_config.py └── tuning │ ├── README.md │ ├── conf │ ├── cluster │ │ └── bcm.yaml │ └── config.yaml │ ├── main.py │ └── src │ ├── result_analysis.py │ ├── search.py │ └── utils.py ├── csp_tools ├── aws │ ├── Dockerfile │ ├── build-nccl-tests.sh │ ├── cluster-validation-sample-output │ │ └── dcgmi-220.out │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml ├── azure │ ├── build-nccl-tests.sh │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml └── oci │ ├── build-nccl-tests.sh │ ├── cluster_validation.sh │ ├── dcgmi_diag.sh │ ├── nccl.sh │ └── topo.xml ├── examples ├── README.md ├── peft │ ├── llama │ │ ├── a100 │ │ │ ├── 13b_1node.sh │ │ │ ├── 13b_1node_lora.sh │ │ │ ├── 70b_1node_lora.sh │ │ │ ├── 70b_2node.sh │ │ │ ├── 7b_1node.sh │ │ │ ├── 7b_1node_lora.sh │ │ │ └── lora_4gpu_k8s.sh │ │ └── h100 │ │ │ ├── 13b_1node.sh │ │ │ ├── 13b_1node_lora.sh │ │ │ ├── 70b_1node_lora.sh │ │ │ ├── 70b_2node.sh │ │ │ ├── 7b_1node.sh │ │ │ └── 7b_1node_lora.sh │ └── nemotron │ │ └── h100 │ │ ├── 22b_1node.sh │ │ ├── 22b_1node_lora.sh │ │ ├── 8b_1node.sh │ │ └── 8b_1node_lora.sh └── training │ ├── gpt │ ├── a100 │ │ ├── 
175b_16node.sh │ │ ├── 20b_1node.sh │ │ ├── 20b_8node.sh │ │ ├── 40b_8node.sh │ │ ├── 5b_1node.sh │ │ ├── 5b_8node.sh │ │ ├── fsdp_20b_1node.sh │ │ ├── fsdp_40b_32node.sh │ │ └── fsdp_5b_1node.sh │ └── h100 │ │ ├── 175b_bf16_16node.sh │ │ ├── 175b_fp8_16node.sh │ │ ├── 20b_bf16_1node.sh │ │ ├── 20b_bf16_8node.sh │ │ ├── 20b_fp8_1node.sh │ │ ├── 20b_fp8_8node.sh │ │ ├── 40b_bf16_8node.sh │ │ ├── 40b_fp8_8node.sh │ │ ├── 5b_bf16_1node.sh │ │ ├── 5b_bf16_8node.sh │ │ ├── 5b_fp8_1node.sh │ │ ├── 5b_fp8_8node.sh │ │ ├── fsdp_20b_bf16_1node.sh │ │ ├── fsdp_40b_bf16_32node.sh │ │ └── fsdp_5b_bf16_1node.sh │ ├── grok1-proxy │ └── h100 │ │ ├── grok1_proxy_bf16.sh │ │ └── grok1_proxy_fp8.sh │ ├── llama │ ├── a100 │ │ ├── llama2_13b_bf16.sh │ │ ├── llama2_70b_bf16.sh │ │ └── llama2_7b_bf16.sh │ └── h100 │ │ ├── llama2_13b_bf16.sh │ │ ├── llama2_13b_fp8.sh │ │ ├── llama2_70b_bf16.sh │ │ ├── llama2_70b_fp8.sh │ │ ├── llama2_7b_bf16.sh │ │ ├── llama2_7b_fp8.sh │ │ ├── llama3_405b_bf16.sh │ │ ├── llama3_405b_fp8.sh │ │ ├── llama3_70b_bf16.sh │ │ ├── llama3_70b_fp8.sh │ │ ├── llama3_8b_bf16.sh │ │ └── llama3_8b_fp8.sh │ ├── mixtral │ └── h100 │ │ ├── mixtral_8x3b_bf16.sh │ │ ├── mixtral_8x3b_fp8.sh │ │ ├── mixtral_8x7b_bf16.sh │ │ └── mixtral_8x7b_fp8.sh │ └── nemotron │ ├── a100 │ ├── nemotron_22b_bf16.sh │ └── nemotron_8b_bf16.sh │ └── h100 │ ├── nemotron_22b_bf16.sh │ ├── nemotron_22b_fp8.sh │ ├── nemotron_8b_bf16.sh │ └── nemotron_8b_fp8.sh ├── img ├── 170M_mT5_loss_final.svg ├── 175B_GPT_3_throughput.svg ├── 220M_T5_loss_final.svg ├── 390M_mT5_loss_final.svg ├── 3B_T5_loss_100percent.svg ├── 3B_T5_loss_75percent.svg ├── 3B_T5_throughput_2205.svg ├── 3B_T5_throughput_2208.svg ├── 3B_mT5_loss_75percent.svg ├── 3B_mT5_loss_final.svg ├── 3B_mT5_throughput_2205.svg ├── 3B_mT5_throughput_2208.svg ├── 4B_bert_throughput_2211.png ├── 4b_bert_loss_final.png ├── 5B_GPT_3_loss_final.svg ├── 5B_GPT_3_throughput.svg ├── infer_model_size_gpt3.svg ├── infer_model_size_gpt3.svg_old ├── infer_model_size_mt5.svg ├── infer_model_size_t5.svg └── model_overview.png ├── launcher_scripts ├── __init__.py ├── conf │ ├── adapter_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── cluster │ │ ├── bcm.yaml │ │ ├── k8s.yaml │ │ └── k8s_v2.yaml │ ├── config.yaml │ ├── conversion │ │ ├── baichuan2 │ │ │ └── convert_baichuan2.yaml │ │ ├── chatglm │ │ │ └── convert_chatglm.yaml │ │ ├── clip │ │ │ └── convert_clip.yaml │ │ ├── controlnet │ │ │ └── convert_controlnet.yaml │ │ ├── dreambooth │ │ │ └── convert_dreambooth.yaml │ │ ├── gpt3 │ │ │ └── convert_gpt3.yaml │ │ ├── imagen │ │ │ └── convert_imagen.yaml │ │ ├── instruct_pix2pix │ │ │ └── convert_instruct_pix2pix.yaml │ │ ├── llama │ │ │ └── convert_llama.yaml │ │ ├── mistral │ │ │ └── convert_mistral.yaml │ │ ├── mixtral │ │ │ ├── convert_mixtral.yaml │ │ │ └── convert_mixtral_8x22b.yaml │ │ ├── mt5 │ │ │ └── convert_mt5.yaml │ │ ├── nemotron │ │ │ └── convert_nemotron.yaml │ │ ├── neva │ │ │ └── convert_neva.yaml │ │ ├── qwen2 │ │ │ └── convert_qwen2.yaml │ │ ├── stable_diffusion │ │ │ └── convert_stable_diffusion.yaml │ │ ├── starcoder2 │ │ │ └── convert_starcoder2.yaml │ │ ├── t5 │ │ │ └── convert_t5.yaml │ │ └── vit │ │ │ └── convert_vit.yaml │ ├── conversion_hf2nemo │ │ ├── hf_llama2 │ │ │ └── convert_llama2_nemo.yaml │ │ ├── hf_mistral_7b │ │ │ └── convert_mistral_7b_nemo.yaml │ │ └── hf_mixtral │ │ │ ├── convert_mixtral_8x22b_nemo.yaml │ │ │ └── convert_mixtral_nemo.yaml │ ├── data_curation │ │ ├── 
common_crawl │ │ │ ├── compute_minhashes │ │ │ │ └── compute_minhashes.yaml │ │ │ ├── connected_component │ │ │ │ └── connected_component.yaml │ │ │ ├── curate_common_crawl.yaml │ │ │ ├── fasttext_download │ │ │ │ └── fasttext_download.yaml │ │ │ ├── find_matching_ngrams │ │ │ │ └── find_matching_ngrams.yaml │ │ │ ├── jaccard_compute │ │ │ │ └── jaccard_compute.yaml │ │ │ ├── jaccard_map_buckets │ │ │ │ └── jaccard_map_buckets.yaml │ │ │ ├── jaccard_shuffle │ │ │ │ └── jaccard_shuffle.yaml │ │ │ ├── language_identification │ │ │ │ └── language_identification.yaml │ │ │ ├── minhash_buckets │ │ │ │ └── minhash_buckets.yaml │ │ │ ├── prepare_task_data │ │ │ │ └── prepare_task_data.yaml │ │ │ ├── quality_filtering │ │ │ │ └── heuristic_english.yaml │ │ │ ├── remove_matching_ngrams │ │ │ │ └── remove_matching_ngrams.yaml │ │ │ ├── separate_by_language │ │ │ │ └── separate_by_language.yaml │ │ │ ├── text_cleaning │ │ │ │ └── text_cleaning.yaml │ │ │ ├── verify_all_pairs_jaccard │ │ │ │ └── verify_all_pairs_jaccard.yaml │ │ │ └── write_deduped_result_with_text │ │ │ │ └── write_deduped_result_with_text.yaml │ │ └── sft │ │ │ ├── curate_sft.yaml │ │ │ ├── find_matching_ngrams │ │ │ └── find_matching_ngrams.yaml │ │ │ ├── prepare_task_data │ │ │ └── prepare_task_data.yaml │ │ │ └── remove_matching_ngrams │ │ │ └── remove_matching_ngrams.yaml │ ├── data_preparation │ │ ├── baichuan2 │ │ │ └── download_baichuan2_pile.yaml │ │ ├── bert │ │ │ └── download_bert_pile.yaml │ │ ├── chatglm │ │ │ └── download_chatglm_pile.yaml │ │ ├── code_llama │ │ │ └── download_human_eval.yaml │ │ ├── falcon │ │ │ └── download_falcon_pile.yaml │ │ ├── fid_evaluation │ │ │ └── download_coco2014.yaml │ │ ├── generic │ │ │ └── custom_dataset.yaml │ │ ├── gpt │ │ │ └── download_slim_pajama.yaml │ │ ├── gpt3 │ │ │ └── download_gpt3_pile.yaml │ │ ├── llama │ │ │ └── download_llama_pile.yaml │ │ ├── mistral │ │ │ ├── download_mistral_nemo_123b_pile.yaml │ │ │ ├── download_mistral_nemo_12b_pile.yaml │ │ │ └── download_mistral_pile.yaml │ │ ├── mixtral │ │ │ ├── download_mixtral_8x22b_pile.yaml │ │ │ └── download_mixtral_pile.yaml │ │ ├── mt5 │ │ │ └── download_mc4.yaml │ │ ├── multimodal │ │ │ ├── download_multimodal.yaml │ │ │ ├── precache_sd.yaml │ │ │ └── precache_t5xxl.yaml │ │ ├── nemotron │ │ │ └── download_nemotron_pile.yaml │ │ ├── steerlm │ │ │ ├── steerlm_data_prep1.yaml │ │ │ └── steerlm_data_prep2_reg.yaml │ │ └── t5 │ │ │ └── download_t5_pile.yaml │ ├── evaluation │ │ ├── adapter_gpt3 │ │ │ └── squad.yaml │ │ ├── adapter_t5 │ │ │ └── squad.yaml │ │ ├── baichuan2 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── chatglm │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── clip │ │ │ └── imagenet_zeroshot.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── evaluate_all.yaml │ │ ├── gpt3 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_lambada.yaml │ │ ├── ia3_gpt3 │ │ │ └── squad.yaml │ │ ├── ia3_t5 │ │ │ └── squad.yaml │ │ ├── imagen │ │ │ └── fid_clip.yaml │ │ ├── llama │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── mistral │ │ │ └── evaluate_all.yaml │ │ ├── mixtral │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_all_8x22b.yaml │ │ ├── mt5 │ │ │ ├── custom_task.yaml │ │ │ └── xquad.yaml │ │ ├── nemotron │ │ │ └── evaluate_all.yaml │ │ ├── peft_baichuan2 │ │ │ └── squad.yaml │ │ ├── peft_chatglm │ │ │ └── squad.yaml │ │ ├── peft_falcon │ │ │ └── squad.yaml │ │ ├── peft_llama │ │ │ └── squad.yaml │ │ ├── peft_mistral │ │ │ └── squad.yaml │ │ ├── 
peft_mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── peft_t5 │ │ │ └── squad.yaml │ │ ├── prompt_gpt3 │ │ │ └── squad.yaml │ │ ├── prompt_llama │ │ │ └── squad.yaml │ │ ├── prompt_mt5 │ │ │ └── squad.yaml │ │ ├── prompt_t5 │ │ │ └── squad.yaml │ │ ├── qwen2 │ │ │ ├── evaluate_all.yaml │ │ │ └── evaluate_boolq.yaml │ │ ├── retro │ │ │ ├── evaluate_nq.yaml │ │ │ └── evaluate_tqa.yaml │ │ ├── stable_diffusion │ │ │ └── fid_clip.yaml │ │ ├── starcoder2 │ │ │ └── human_eval.yaml │ │ ├── t5 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ └── vit │ │ │ └── imagenet_val.yaml │ ├── export │ │ ├── gpt3 │ │ │ └── export_gpt3.yaml │ │ ├── mt5 │ │ │ └── export_mt5.yaml │ │ └── t5 │ │ │ └── export_t5.yaml │ ├── external_conversion │ │ └── clip │ │ │ └── convert_external_clip.yaml │ ├── fine_tuning │ │ ├── baichuan2 │ │ │ └── squad.yaml │ │ ├── bert_embedding │ │ │ └── sft.yaml │ │ ├── chatglm │ │ │ └── squad.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── squad.yaml │ │ ├── gpt3 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ ├── mamba │ │ │ └── sft.yaml │ │ ├── mistral │ │ │ └── squad.yaml │ │ ├── mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── mt5 │ │ │ ├── custom_task.yaml │ │ │ └── xquad.yaml │ │ ├── neva │ │ │ ├── llama2_13b_chat.yaml │ │ │ ├── llama2_7b_chat.yaml │ │ │ ├── llama3_70b_chat.yaml │ │ │ ├── llama3_8b_chat.yaml │ │ │ ├── mistral_7b_instruct.yaml │ │ │ └── mixtral_8x7b_instruct.yaml │ │ ├── nsfw │ │ │ └── nsfw_L_14.yaml │ │ ├── qwen2 │ │ │ └── squad.yaml │ │ ├── t5 │ │ │ ├── custom_task.yaml │ │ │ └── squad.yaml │ │ ├── video_neva │ │ │ └── llama3_8b_vita.yaml │ │ └── vit │ │ │ └── imagenet1k.yaml │ ├── fw_inference │ │ ├── clip │ │ │ └── clip_similarity.yaml │ │ ├── controlnet │ │ │ └── controlnet_infer.yaml │ │ ├── dreambooth │ │ │ └── text2img.yaml │ │ ├── imagen │ │ │ └── text2img.yaml │ │ ├── instruct_pix2pix │ │ │ └── edit_cli.yaml │ │ ├── neva │ │ │ └── inference.yaml │ │ ├── nsfw │ │ │ └── nsfw.yaml │ │ ├── retro │ │ │ └── retro_inference.yaml │ │ ├── sdxl │ │ │ └── sdxl_infer.yaml │ │ ├── stable_diffusion │ │ │ └── text2img.yaml │ │ ├── video_neva │ │ │ └── inference.yaml │ │ └── vit │ │ │ └── imagenet1k.yaml │ ├── ia3_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── peft │ │ ├── baichuan2 │ │ │ └── squad.yaml │ │ ├── chatglm │ │ │ └── squad.yaml │ │ ├── code_llama │ │ │ └── human_eval.yaml │ │ ├── falcon │ │ │ └── squad.yaml │ │ ├── gemma │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── griffin │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── llama │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── mistral │ │ │ └── squad.yaml │ │ ├── mistral_embedding │ │ │ └── squad.yaml │ │ ├── mixtral │ │ │ ├── squad.yaml │ │ │ └── squad_8x22b.yaml │ │ ├── nemotron │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── neva │ │ │ ├── llama2_13b_chat.yaml │ │ │ ├── llama2_70b_chat.yaml │ │ │ ├── llama2_7b_chat.yaml │ │ │ ├── llama3_70b_chat.yaml │ │ │ ├── llama3_8b_chat.yaml │ │ │ ├── mistral_7b_instruct.yaml │ │ │ ├── mixtral_8x7b_instruct.yaml │ │ │ └── nemotron4_340b_chat.yaml │ │ ├── qwen2 │ │ │ ├── sft.yaml │ │ │ └── squad.yaml │ │ ├── starcoder2 │ │ │ └── sft.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── prompt_learning │ │ ├── gpt3 │ │ │ └── squad.yaml │ │ ├── llama │ │ │ └── squad.yaml │ │ ├── mt5 │ │ │ └── squad.yaml │ │ └── t5 │ │ │ └── squad.yaml │ ├── ptq │ │ └── model │ │ │ └── 
quantization.yaml │ ├── rag_generating │ │ └── gpt3 │ │ │ ├── 125m.yaml │ │ │ └── 7b.yaml │ ├── rag_indexing │ │ └── bert │ │ │ ├── 110m.yaml │ │ │ └── 340m.yaml │ ├── rlhf_ppo │ │ └── gpt3 │ │ │ └── 2b_ppo.yaml │ ├── rlhf_rm │ │ └── gpt3 │ │ │ └── 2b_rm.yaml │ ├── steerlm_reg │ │ ├── ac_sft │ │ │ └── gpt_sft.yaml │ │ └── rw_sft │ │ │ └── training_rm.yaml │ └── training │ │ ├── baichuan2 │ │ ├── baichuan2_13b.yaml │ │ └── baichuan2_7b.yaml │ │ ├── bert │ │ ├── 100b.yaml │ │ ├── 110m.yaml │ │ ├── 20b.yaml │ │ └── 4b.yaml │ │ ├── chatglm │ │ ├── chatglm2-6b.yaml │ │ └── chatglm3-6b.yaml │ │ ├── clip │ │ ├── siglip_config.yaml │ │ ├── vit_B_32.yaml │ │ ├── vit_H_14.yaml │ │ └── vit_g_14.yaml │ │ ├── controlnet │ │ └── controlnet_v1-5.yaml │ │ ├── dreambooth │ │ └── 860m.yaml │ │ ├── falcon │ │ └── falcon_7b.yaml │ │ ├── gpt3 │ │ ├── 126m.yaml │ │ ├── 175b.yaml │ │ ├── 175b_16k.yaml │ │ ├── 175b_32k.yaml │ │ ├── 175b_mlperf.yaml │ │ ├── 1b_improved.yaml │ │ ├── 20b.yaml │ │ ├── 400m_improved.yaml │ │ ├── 40b.yaml │ │ ├── 40b_16k.yaml │ │ ├── 40b_32k.yaml │ │ ├── 40b_64k.yaml │ │ ├── 40b_improved.yaml │ │ ├── 5b.yaml │ │ ├── 5b_16k.yaml │ │ ├── 5b_32k.yaml │ │ ├── 5b_64k.yaml │ │ ├── 7b_improved.yaml │ │ ├── mlperf-24n.yaml │ │ └── mlperf.yaml │ │ ├── grok │ │ └── grok1_proxy.yaml │ │ ├── imagen │ │ ├── 2b_res_64.yaml │ │ ├── 400m_res_256.yaml │ │ ├── 500m_res_64.yaml │ │ ├── 600m_res_1024.yaml │ │ └── 600m_res_256.yaml │ │ ├── instruct_pix2pix │ │ └── 860m_sd_edit.yaml │ │ ├── llama │ │ ├── llama1_13b.yaml │ │ ├── llama1_30b.yaml │ │ ├── llama1_65b.yaml │ │ ├── llama1_7b.yaml │ │ ├── llama2_13b.yaml │ │ ├── llama2_70b.yaml │ │ ├── llama2_7b.yaml │ │ ├── llama3_1_405b.yaml │ │ ├── llama3_1_70b.yaml │ │ ├── llama3_1_8b.yaml │ │ ├── llama3_70b.yaml │ │ └── llama3_8b.yaml │ │ ├── mistral │ │ ├── mistral_7b.yaml │ │ ├── mistral_nemo_123b.yaml │ │ └── mistral_nemo_12b.yaml │ │ ├── mixtral │ │ ├── mixtral_8x22b.yaml │ │ ├── mixtral_8x3b.yaml │ │ └── mixtral_8x7b.yaml │ │ ├── mt5 │ │ ├── 11b.yaml │ │ ├── 170m.yaml │ │ ├── 23b.yaml │ │ ├── 390m.yaml │ │ └── 3b.yaml │ │ ├── nemotron │ │ ├── nemotron_15b.yaml │ │ ├── nemotron_22b.yaml │ │ ├── nemotron_340b.yaml │ │ ├── nemotron_4b.yaml │ │ └── nemotron_8b.yaml │ │ ├── nerf │ │ ├── dreamfusion-dmtet.yaml │ │ ├── dreamfusion.yaml │ │ └── model │ │ │ ├── background │ │ │ ├── random.yaml │ │ │ ├── static.yaml │ │ │ ├── tcnn.yaml │ │ │ └── torchngp.yaml │ │ │ ├── data │ │ │ └── data.yaml │ │ │ ├── dreamfusion-dmtet.yaml │ │ │ ├── dreamfusion.yaml │ │ │ ├── guidance │ │ │ ├── sd_huggingface.yaml │ │ │ ├── sd_nemo.yaml │ │ │ └── sd_trt.yaml │ │ │ ├── loss │ │ │ ├── dmtet.yaml │ │ │ └── dreamfusion.yaml │ │ │ ├── material │ │ │ └── basic_shading.yaml │ │ │ ├── nerf │ │ │ ├── tcnn.yaml │ │ │ └── torchngp.yaml │ │ │ ├── optim │ │ │ └── adan.yaml │ │ │ └── renderer │ │ │ ├── nerfacc.yaml │ │ │ ├── nvdiffrast.yaml │ │ │ └── torchngp_raymarching.yaml │ │ ├── neva │ │ ├── llama2_13b_chat.yaml │ │ ├── llama2_70b_chat.yaml │ │ ├── llama2_7b_chat.yaml │ │ ├── llama3_70b_chat.yaml │ │ ├── llama3_8b_chat.yaml │ │ ├── mistral_7b_instruct.yaml │ │ ├── mixtral_8x7b_instruct.yaml │ │ └── nemotron4_340b_chat.yaml │ │ ├── qwen2 │ │ ├── qwen2_14b.yaml │ │ ├── qwen2_4b.yaml │ │ ├── qwen2_72b.yaml │ │ └── qwen2_7b.yaml │ │ ├── retro │ │ └── 300m.yaml │ │ ├── sdxl │ │ ├── sdxl_base_train_res_1024_stage_3.yaml │ │ ├── sdxl_base_train_res_256_stage_1.yaml │ │ └── sdxl_base_train_res_512_stage_2.yaml │ │ ├── stable_diffusion │ │ ├── 860m_res_256_pretrain.yaml │ │ ├── 
860m_res_256_v2_0_pretrain.yaml │ │ ├── 860m_res_512_v1_1.yaml │ │ ├── 860m_res_512_v1_2.yaml │ │ ├── 860m_res_512_v1_5.yaml │ │ └── 860m_res_512_v2_0_base.yaml │ │ ├── starcoder2 │ │ ├── starcoder2_15b.yaml │ │ ├── starcoder2_3b.yaml │ │ └── starcoder2_7b.yaml │ │ ├── t5 │ │ ├── 11b.yaml │ │ ├── 220m.yaml │ │ ├── 23b.yaml │ │ ├── 3b.yaml │ │ └── 41b.yaml │ │ ├── tp_overlap │ │ ├── ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml │ │ ├── ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192.yaml │ │ ├── ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml │ │ ├── ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h5120_tp2_mbs1_seqlen4096.yaml │ │ ├── ub_cfg_h100_h6144_tp2_mbs2_seqlen2048.yaml │ │ ├── ub_cfg_h100_h8192_tp2_mbs1_seqlen4096.yaml │ │ ├── ub_cfg_h100_h8192_tp4_mbs1_seqlen4096.yaml │ │ └── ub_cfg_h100_h8192_tp4_mbs1_seqlen8192.yaml │ │ ├── video_neva │ │ ├── llama2_13b_chat.yaml │ │ ├── llama2_70b_chat.yaml │ │ ├── llama2_7b_chat.yaml │ │ ├── mistral_7b_instruct.yaml │ │ └── mixtral_8x7b_instruct.yaml │ │ └── vit │ │ ├── B_16.yaml │ │ ├── H_14.yaml │ │ ├── L_16.yaml │ │ ├── bigG_14.yaml │ │ └── g_14.yaml ├── data │ └── nsfw │ │ └── concepts.txt ├── main.py ├── nemo_launcher │ ├── __init__.py │ ├── collections │ │ ├── __init__.py │ │ ├── auto_blend.py │ │ ├── checkpoint_search.py │ │ ├── conditional_cfgs.py │ │ ├── conf │ │ │ ├── auto_blend.yaml │ │ │ ├── checkpoint_search.yaml │ │ │ ├── get_ag_overlap.yaml │ │ │ ├── get_ln_sm_margin.yaml │ │ │ ├── hparams_override.yaml │ │ │ └── numa_mapping.yaml │ │ ├── datacuration_scripts │ │ │ └── download_fasttext.sh │ │ ├── dataprep_scripts │ │ │ ├── __init__.py │ │ │ ├── anthropichh_dataprep │ │ │ │ └── download_and_process.py │ │ │ ├── custom_dataprep │ │ │ │ ├── __init__.py │ │ │ │ └── preprocess.py │ │ │ ├── dolly_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ └── preprocess.py │ │ │ ├── fid_evaluation_dataprep │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ └── preprocess.py │ │ │ ├── mc4_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── prepare.py │ │ │ │ ├── preprocess.py │ │ │ │ └── setup_preprocess.py │ │ │ ├── multimodal_dataprep │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ ├── download_images.py │ │ │ │ ├── download_parquet.py │ │ │ │ ├── generate_wdinfo.py │ │ │ │ ├── merge_source_tar.py │ │ │ │ ├── precache_encodings.py │ │ │ │ └── reorganize_tar.py │ │ │ ├── pile_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── conf │ │ │ │ │ └── config.yaml │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ └── preprocess.py │ │ │ └── slim_pajama_dataprep │ │ │ │ ├── __init__.py │ │ │ │ ├── concat.sh │ │ │ │ ├── conf │ │ │ │ └── config.yaml │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ └── preprocess.py │ │ ├── eval_diffusion_fid_clip │ │ │ ├── TFinception_V3.py │ │ │ ├── compute_clip_score.py │ │ │ ├── compute_fid.py │ │ │ ├── eval_fid.py │ │ │ ├── fid_dataset.py │ │ │ └── plot.py │ │ ├── eval_harness │ │ │ ├── __init__.py │ │ │ ├── download.py │ │ │ ├── evaluate.py │ │ │ └── lm_eval │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── evaluator.py │ │ │ │ ├── metrics.py │ │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── dummy.py │ │ │ │ ├── nemo_baichuan2.py │ │ │ │ ├── nemo_chatglm.py │ │ │ │ ├── nemo_falcon.py │ │ │ │ ├── nemo_gpt3.py │ │ │ │ ├── nemo_gpt3_prompt.py │ 
│ │ │ ├── nemo_llama.py │ │ │ │ ├── nemo_llama_prompt.py │ │ │ │ ├── nemo_mistral.py │ │ │ │ ├── nemo_mixtral.py │ │ │ │ └── nemo_qwen2.py │ │ │ │ ├── tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── hellaswag.py │ │ │ │ ├── lambada.py │ │ │ │ ├── piqa.py │ │ │ │ ├── prompt.py │ │ │ │ ├── race.py │ │ │ │ ├── superglue.py │ │ │ │ ├── wikitext.py │ │ │ │ └── winogrande.py │ │ │ │ └── utils.py │ │ ├── export_scripts │ │ │ ├── __init__.py │ │ │ └── prepare_triton_model_config.py │ │ ├── gpu_affinity.py │ │ ├── hparams_override.py │ │ ├── metric_calculation │ │ │ ├── __init__.py │ │ │ ├── fine_tuning_metric_calc.py │ │ │ └── squad_metric_calc.py │ │ ├── numa_mapping.py │ │ ├── pause_and_prime_dns_connections.py │ │ └── run_dask_stage.sh │ ├── core │ │ ├── __init__.py │ │ ├── data_curation_stages.py │ │ ├── data_stages.py │ │ ├── export_stages.py │ │ ├── k8s_templates │ │ │ ├── conversion │ │ │ │ ├── Chart.yaml │ │ │ │ ├── conversion.yaml │ │ │ │ └── values.yaml │ │ │ ├── data_preparation │ │ │ │ ├── Chart.yaml │ │ │ │ ├── data-prep-config.yaml │ │ │ │ ├── data-prep.yaml │ │ │ │ └── values.yaml │ │ │ ├── evaluation │ │ │ │ ├── Chart.yaml │ │ │ │ ├── evaluation-config.yaml │ │ │ │ ├── evaluation.yaml │ │ │ │ └── values.yaml │ │ │ ├── peft │ │ │ │ ├── Chart.yaml │ │ │ │ ├── peft-config.yaml │ │ │ │ ├── peft.yaml │ │ │ │ └── values.yaml │ │ │ ├── rlhf_ppo │ │ │ │ ├── Chart.yaml │ │ │ │ ├── rlhf-ppo-actor.yaml │ │ │ │ ├── rlhf-ppo-config.yaml │ │ │ │ ├── rlhf-ppo-critic.yaml │ │ │ │ └── values.yaml │ │ │ └── training │ │ │ │ ├── Chart.yaml │ │ │ │ ├── training-config.yaml │ │ │ │ ├── training.yaml │ │ │ │ └── values.yaml │ │ ├── launchers.py │ │ ├── logger.py │ │ ├── rlhf_stages.py │ │ ├── stages.py │ │ └── v2 │ │ │ ├── __init__.py │ │ │ ├── config_k8s.py │ │ │ ├── config_k8s_test.py │ │ │ ├── stages.py │ │ │ ├── stages_test.py │ │ │ ├── step_k8s.py │ │ │ └── step_k8s_test.py │ └── utils │ │ ├── __init__.py │ │ ├── data_utils │ │ ├── __init__.py │ │ ├── download_squad.py │ │ ├── prepare_squad.py │ │ └── prompt_learning_squad_preprocessing.py │ │ ├── file_utils.py │ │ └── job_utils.py └── tests │ ├── __init__.py │ └── unit_tests │ ├── __init__.py │ ├── config_tests │ ├── test_cluster_config.py │ ├── test_fault_tol_config.py │ └── test_main_config.py │ ├── stages_tests │ ├── __init__.py │ ├── test_adapters.py │ ├── test_convert.py │ ├── test_data_prep.py │ ├── test_eval.py │ ├── test_export.py │ ├── test_fine_tune.py │ ├── test_ia3.py │ ├── test_prompt_learn.py │ ├── test_ptq.py │ └── test_train.py │ └── utils_tests │ ├── __init__.py │ └── test_file_utils.py ├── requirements.txt └── setup.cfg /.github/workflows/autoconf.yml: -------------------------------------------------------------------------------- 1 | name: autoconf 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | unit_tests: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: ./auto_configurator 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: | 20 | python -m pip install pytest 21 | pip install -r requirements.txt 22 | pip install requests-mock 23 | 24 | - name: run unit tests 25 | run: pytest 26 | working-directory: ${{env.working-directory}} 27 | -------------------------------------------------------------------------------- /.github/workflows/launcher.yml: -------------------------------------------------------------------------------- 1 | name: launcher 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 
7 | 8 | jobs: 9 | unit_tests: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: ./launcher_scripts 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: | 20 | python -m pip install pytest requests-mock -r requirements.txt 21 | 22 | - name: run unit tests 23 | run: PYTHONPATH=$PWD pytest 24 | working-directory: ${{env.working-directory}} 25 | -------------------------------------------------------------------------------- /.github/workflows/style.yml: -------------------------------------------------------------------------------- 1 | name: code_style 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | black: 10 | runs-on: ubuntu-latest 11 | env: 12 | working-directory: . 13 | 14 | steps: 15 | - name: checkout the repo 16 | uses: actions/checkout@v3 17 | 18 | - name: install dependencies 19 | run: pip install --upgrade black==19.10b0 click==8.0.2 20 | 21 | - name: code style check 22 | run: black . --check --verbose --diff 23 | working-directory: ${{env.working-directory}} 24 | -------------------------------------------------------------------------------- /auto_configurator/autoconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_configurator/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | gpus_per_task: null 5 | gpus_per_node: 8 6 | mem: 0 7 | job_name_prefix: "nemo_megatron_autoconfig:" 8 | srun_args: 9 | - "--no-container-mount-home" 10 | -------------------------------------------------------------------------------- /auto_configurator/conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - cluster: bcm 4 | - search_config: gpt3/5b 5 | - override hydra/job_logging: stdout 6 | 7 | hydra: 8 | run: 9 | dir: . 10 | output_subdir: null 11 | 12 | run_training_hp_search: True 13 | run_inference_hp_search: True 14 | 15 | cluster_type: bcm # bcm or bcp 16 | auto_configurator_path: ??? # Path to the location of auto_configurator codebase. 17 | launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts 18 | base_results_dir: ${auto_configurator_path}/results 19 | data_dir: ${launcher_scripts_path}/data 20 | 21 | training_container: nvcr.io/nvidia/nemo:24.09 22 | container_mounts: 23 | - null 24 | 25 | wandb: # Weights and Biases (W&B) logging. 26 | enable: False # Whether to save logs to W&B. 27 | api_key_file: null # Path to the file where the w&B api key is stored. Key must be on the first line. 28 | project: nemo-megatron-autoconfig # Name of the W&B project to store the logs in. 
The name of the run will be populated automatically. 29 | 30 | # Do not modify the code below. 31 | search_config_value: ${hydra:runtime.choices.search_config} 32 | -------------------------------------------------------------------------------- /auto_configurator/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auto_configurator/tests/base_configs_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auto_configurator/tests/code_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/auto_configurator/tests/code_tests/__init__.py -------------------------------------------------------------------------------- /auto_configurator/tests/config_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/auto_configurator/tests/config_tests/__init__.py -------------------------------------------------------------------------------- /auto_configurator/tests/config_tests/test_cluster_config.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | 4 | class TestClusterConfig: 5 | def test_cluster_bcm_config(self): 6 | conf = OmegaConf.load("conf/cluster/bcm.yaml") 7 | s = """ 8 | partition: null 9 | account: null 10 | exclusive: True 11 | gpus_per_task: null 12 | gpus_per_node: 8 13 | mem: 0 14 | job_name_prefix: "nemo_megatron_autoconfig:" 15 | srun_args: 16 | - "--no-container-mount-home" 17 | """ 18 | expected = OmegaConf.create(s) 19 | assert ( 20 | expected == conf 21 | ), f"conf/cluster/bcm.yaml must be set to {expected} but it currently is {conf}." 22 | -------------------------------------------------------------------------------- /auto_configurator/tuning/README.md: -------------------------------------------------------------------------------- 1 | # NeMo Framework Launcher Fine-Tuning Autoconfigurator 2 | 3 | The fine-tuning autoconfigurator runs a hyperparameter search for fine-tuning jobs on a Slurm cluster. As in a standard grid search, it launches a sequence of jobs with different hyperparameter configurations and analyzes their results to find the best-performing one in terms of validation loss. 4 | 5 | ## Usage 6 | 1. Specify the Slurm cluster parameters in `conf/cluster/bcm.yaml` 7 | 2. Fill in all required values in `conf/config.yaml`. The `search_config.param_grid` field defines the set of hyperparameter values to sweep (see the illustrative sketch below): hyperparameter names are given in Hydra dot notation, and each value is a list of candidate values to choose from. 8 | 3. Run the hyperparameter search with `python3 main.py`
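
For illustration only, a grid of the following shape could be specified; the dotted hyperparameter paths below are hypothetical examples, not values taken from this repository's configs:

```yaml
# Hypothetical sketch of a grid-search specification (the layout follows the
# search_config.param_grid convention described above; the dotted paths are examples).
search_config:
  param_grid:
    peft.model.optim.lr: [1.0e-4, 5.0e-5, 1.0e-5]         # learning-rate candidates
    peft.model.peft.lora_tuning.adapter_dim: [8, 16, 32]  # LoRA adapter-dimension candidates
    peft.model.global_batch_size: [64, 128]               # global-batch-size candidates
```

A full grid over these lists would launch 3 × 3 × 2 = 18 fine-tuning jobs, one per combination, with the best run selected by validation loss as described above.
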
9 | 10 | The following results will be stored in `base_results_dir` for each hyperparameter search: 11 | - `candidate_configs` - .yaml config files used for the different experiments 12 | - `ft_logs` - logs of NeMo fine-tuning jobs 13 | - `final_result` - a folder containing the result analysis logs and the experiment summary in `results.csv` -------------------------------------------------------------------------------- /auto_configurator/tuning/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | gpus_per_task: null 5 | gpus_per_node: null 6 | mem: 0 7 | job_name_prefix: ":" 8 | srun_args: 9 | - "--no-container-mount-home" 10 | -------------------------------------------------------------------------------- /auto_configurator/tuning/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Entry point, main file to run to launch fine-tuning autoconfigurator jobs.""" 16 | 17 | import hydra 18 | import omegaconf 19 | from src.search import run_search 20 | 21 | 22 | @hydra.main(config_path="conf", config_name="config") 23 | def main(cfg: omegaconf.dictconfig.DictConfig) -> None: 24 | """ 25 | Entry point for the fine-tuning autoconfigurator pipeline. Reads the config using 26 | hydra and runs the fine-tuning hyperparameter search. 27 | :param omegaconf.dictconfig.DictConfig cfg: OmegaConf object, read using 28 | the @hydra.main decorator. 29 | :return: None 30 | """ 31 | run_search(cfg=cfg) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /csp_tools/aws/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
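# Note (assumptions, not part of the original script): this is written as a Slurm batch
# script, so it is typically submitted with `sbatch` so the #SBATCH directives below take
# effect. The `srun --container-image`/`--container-mounts` options assume the pyxis/enroot
# container plugin is available on the cluster, and the relative image path
# ../../nemo_megatron_training.sqsh assumes the job is launched from within csp_tools/aws.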
15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | srun --container-mounts="$PWD:/nccl" \ 20 | --container-image=../../nemo_megatron_training.sqsh \ 21 | bash -c " 22 | cd /nccl && 23 | curl -fSsL --proto '=https' https://github.com/NVIDIA/nccl-tests/tarball/master | tar xz && 24 | mv NVIDIA-nccl-tests* nccl-tests && 25 | cd nccl-tests && 26 | make -j CUDA_HOME=/usr/local/cuda MPI=1 MPI_HOME=/opt/amazon/openmpi/" 27 | -------------------------------------------------------------------------------- /csp_tools/aws/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --gpus-per-node=8 18 | #SBATCH --time=1:00:00 19 | 20 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 21 | # This script should not be called manually. It should only be called by cluster_validation.sh 22 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 23 | -------------------------------------------------------------------------------- /csp_tools/azure/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | HPCX_PATH="/opt/hpcx-v2.9.0-gcc-MLNX_OFED_LINUX-5.4-1.0.3.0-ubuntu18.04-x86_64" 20 | 21 | export OMPI_MCA_pml=ucx 22 | export OMPI_MCA_btl=^openib 23 | 24 | srun --container-mounts="$PWD:/nccl,$HPCX_PATH:/opt/hpcx" \ 25 | --container-image="nvcr.io/nvidia/pytorch:21.09-py3" \ 26 | --container-name="nccl" \ 27 | bash -c " 28 | cd /nccl && 29 | git clone https://github.com/NVIDIA/nccl-tests.git && 30 | source /opt/hpcx/hpcx-init.sh && 31 | hpcx_load && 32 | cd nccl-tests && 33 | make MPI=1" 34 | -------------------------------------------------------------------------------- /csp_tools/azure/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --time=1:00:00 18 | 19 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 20 | # This script should not be called manually. It should only be called by cluster_validation.sh 21 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 22 | -------------------------------------------------------------------------------- /csp_tools/oci/build-nccl-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --ntasks=1 17 | #SBATCH --ntasks-per-node=1 18 | 19 | HPCX_PATH="/opt/hpcx-v2.11-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.11-x86_64" 20 | 21 | export OMPI_MCA_pml=ucx 22 | export OMPI_MCA_btl=^openib 23 | 24 | srun --container-mounts="$PWD:/nccl,$HPCX_PATH:/opt/hpcx" \ 25 | --container-image="nvcr.io/nvidia/pytorch:21.09-py3" \ 26 | --container-name="nccl" \ 27 | bash -c " 28 | cd /nccl && 29 | git clone https://github.com/NVIDIA/nccl-tests.git && 30 | source /opt/hpcx/hpcx-init.sh && 31 | hpcx_load && 32 | cd nccl-tests && 33 | make MPI=1" 34 | -------------------------------------------------------------------------------- /csp_tools/oci/dcgmi_diag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #SBATCH --job-name=dcgmi-diag 17 | #SBATCH --time=1:00:00 18 | 19 | # This is a Data Center GPU Manager container. This command will run GPU diagnostics. 20 | # This script should not be called manually. 
It should only be called by cluster_validation.sh 21 | srun --container-image=nvcr.io/nvidia/cloud-native/dcgm:2.3.5-1-ubi8 bash -c "dcgmi diag -r 3" 22 | -------------------------------------------------------------------------------- /examples/peft/llama/a100/lora_4gpu_k8s.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | set -eu 5 | 6 | #Users should specify the following directories 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=$(readlink -f ${SCRIPT_DIR}/../../../..) 8 | DATA_DIR=${DATA_DIR} 9 | RESTORE_FROM_PATH=${RESTORE_FROM_PATH} 10 | RUN_NAME=${RUN_NAME:-llama-7b-peft-lora} 11 | PEFT_CONFIG=${PEFT_CONFIG:-llama/squad} 12 | 13 | # peft.model.megatron_amp_O2=false is needed on containers earlier than 23.11 that 14 | # do not include https://github.com/NVIDIA/NeMo/pull/7971 15 | TRANSIENT_OVERRIDES="peft.model.megatron_amp_O2=false" 16 | 17 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 18 | cluster=k8s_v2 \ 19 | cluster_type=k8s \ 20 | cluster.ib_interfaces=null \ 21 | container=nvcr.io/nvidia/nemo:24.09 \ 22 | stages=[peft] \ 23 | peft=${PEFT_CONFIG} \ 24 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 25 | data_dir=${DATA_DIR} \ 26 | peft.run.name="${RUN_NAME}" \ 27 | peft.trainer.num_nodes=1 \ 28 | peft.trainer.devices=4 \ 29 | peft.trainer.max_epochs=null \ 30 | peft.trainer.max_steps=2000 \ 31 | peft.model.global_batch_size=128 \ 32 | peft.model.micro_batch_size=1 \ 33 | peft.model.restore_from_path=$RESTORE_FROM_PATH \ 34 | $TRANSIENT_OVERRIDES \ 35 | $@ 36 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/175b_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_a100_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/20b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
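# For example (the paths below are placeholders, not part of this repository):
#   NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher \
#   DATA_DIR=/path/to/preprocessed/pile \
#   bash examples/training/gpt/a100/20b_1node.sh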
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_a100_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/20b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_a100_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training.run.time_limit=0:20:00 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/40b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
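# Batch-size arithmetic for this recipe (assuming 8 GPUs per node): data-parallel size =
# (8 nodes * 8 GPUs) / (tensor_parallel 2 * pipeline_parallel 4) = 8, so global_batch_size
# 256 = micro_batch_size 2 * data-parallel 8 * 16 gradient-accumulation steps.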
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_a100_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/5b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_a100_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/5b_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_a100_8node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training.run.time_limit=0:20:00 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/fsdp_20b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example does pre-training GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_a100_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/a100/fsdp_5b_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example does pre-training GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 
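# In the Hydra overrides below, a leading "+" adds a key that is not in the base config
# (e.g. +training.model.fsdp=True), while a leading "~" deletes one (e.g. the optimizer
# overlap/bucket options that this FSDP example removes).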
7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_a100_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/175b_bf16_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_h100_bf16_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/175b_fp8_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
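# Relative to the bf16 recipe above, this run additionally sets training.model.fp8=true and
# exports NVTE_FUSED_ATTN=1 (selecting Transformer Engine's fused attention backend); the
# parallelism layout (TP=4, PP=8, virtual PP=12 across 16 nodes) is unchanged.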
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/175b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="175b_h100_fp8_16node" \ 16 | training.trainer.num_nodes=16 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=1 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=8 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.model.fp8=true \ 23 | training.run.time_limit=0:20:00 \ 24 | +training.model.optim.grad_sync_dtype=bf16 \ 25 | +env_vars.NVTE_FUSED_ATTN=1 \ 26 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_bf16_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_fp8_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_fp8_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=4 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/20b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
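# FP8 counterpart of the 8-node 20B run: identical parallelism (TP=2, PP=1) and
# global batch size 2048, with training.model.fp8=true added.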
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/20b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="20b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/40b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/40b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
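# FP8 variant of the 8-node 40B recipe: TP=2, PP=4, virtual PP=12, global batch
# size 256, with training.model.fp8=true added on top of the BF16 settings.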
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/40b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="40b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=2 \ 19 | training.model.tensor_model_parallel_size=2 \ 20 | training.model.pipeline_model_parallel_size=4 \ 21 | training.model.virtual_pipeline_model_parallel_size=12 \ 22 | training.model.fp8=true \ 23 | training.run.time_limit=0:20:00 \ 24 | +training.model.optim.grad_sync_dtype=bf16 \ 25 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_bf16_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_bf16_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
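# 8-node BF16 run of the 5B model: data parallelism only (TP=1, PP=1), with the
# global batch size raised from 256 to 2048 relative to the single-node recipe.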
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_bf16_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.run.time_limit=0:20:00 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_fp8_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_fp8_1node" \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=256 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/5b_fp8_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Users should specify the path to the launcher directory and the dataset in the 4 | # commandline or in this run script. 
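# FP8 counterpart of the 8-node 5B run: same data-parallel-only layout (TP=1, PP=1)
# and global batch size 2048, with training.model.fp8=true added.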
5 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 6 | DATA_DIR=${DATA_DIR} 7 | 8 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 9 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 10 | training=gpt3/5b \ 11 | stages=[training] \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | data_dir=${DATA_DIR} \ 14 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 15 | training.run.name="5b_h100_fp8_8node" \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=2048 \ 18 | training.model.micro_batch_size=4 \ 19 | training.model.tensor_model_parallel_size=1 \ 20 | training.model.pipeline_model_parallel_size=1 \ 21 | training.model.fp8=true \ 22 | training.run.time_limit=0:20:00 \ 23 | +training.model.optim.grad_sync_dtype=bf16 \ 24 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/fsdp_20b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example pre-trains a GPT 20B model using torch FSDP + TP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/20b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_20b_h100_bf16_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=32 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=False \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/gpt/h100/fsdp_5b_bf16_1node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #This example pre-trains a GPT 5B model using torch FSDP. 4 | 5 | # Users should specify the path to the launcher directory and the dataset in the 6 | # commandline or in this run script. 
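# The FSDP examples use "bf16-mixed" trainer precision with megatron_amp_O2 disabled,
# switch the optimizer to fused_adam, and remove the options that configure the Megatron
# distributed optimizer (bucket_cap_mb, overlap_grad_sync, overlap_param_sync,
# contiguous_grad_buffer), which are not used when torch FSDP handles sharding.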
7 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 8 | DATA_DIR=${DATA_DIR} 9 | 10 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 11 | python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 12 | training=gpt3/5b \ 13 | stages=[training] \ 14 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 15 | data_dir=${DATA_DIR} \ 16 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 17 | training.trainer.precision="bf16-mixed" \ 18 | training.run.name="fsdp_5b_h100_bf16_1node" \ 19 | training.trainer.num_nodes=1 \ 20 | training.model.global_batch_size=256 \ 21 | training.model.megatron_amp_O2=False \ 22 | training.model.use_cpu_initialization=True \ 23 | +training.model.fsdp=True \ 24 | +training.model.fsdp_sharded_checkpoint=True \ 25 | training.model.optim.name="fused_adam" \ 26 | ~training.model.optim.bucket_cap_mb \ 27 | ~training.model.optim.overlap_grad_sync \ 28 | ~training.model.optim.overlap_param_sync \ 29 | ~training.model.optim.contiguous_grad_buffer \ 30 | training.run.time_limit=0:20:00 \ 31 | -------------------------------------------------------------------------------- /examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=grok/grok1_proxy \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="grok1_proxy_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | training.model.moe_grouped_gemm=False \ 19 | training.model.gradient_accumulation_fusion=True \ 20 | +training.model.optim.grad_sync_dtype=bf16 \ 21 | training.trainer.num_nodes=64 \ 22 | +training.model.context_parallel_size=2 \ 23 | training.model.sequence_parallel=True \ 24 | training.model.tensor_model_parallel_size=4 \ 25 | training.model.pipeline_model_parallel_size=8 \ 26 | training.model.virtual_pipeline_model_parallel_size=8 \ 27 | training.model.gc_interval=40 28 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_13b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | 
training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/a100/llama2_7b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_13b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | 
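# Example invocation (the paths below are placeholders; substitute your own).
# NEMO_FRAMEWORK_LAUNCHER_DIR defaults to /opt/NeMo-Framework-Launcher if unset:
#   DATA_DIR=/path/to/preprocessed_data TOK_PATH=/path/to/llama_tokenizer.model bash llama2_13b_bf16.sh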
-------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_13b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_13b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_13b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_70b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_70b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_7b_bf16.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama2_7b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama2_7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama2_7b_fp8" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_405b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_405b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_1_405b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=72 \ 17 | training.model.global_batch_size=252 \ 18 | training.model.tokenizer.model=${TOK_PATH} \ 19 | +training.model.optim.grad_sync_dtype=bf16 \ 20 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_405b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | 
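# Llama 3.1 405B FP8 recipe: 72 nodes, global batch size 252, FP8 hybrid enabled,
# plus a tensor-parallel communication overlap config whose name indicates tuning for
# TP=8, CP=2 and sequence length 8192 (ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192).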
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_405b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_1_405b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=72 \ 17 | training.model.global_batch_size=252 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_70b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_70b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_70b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_70b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_70b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.trainer.num_nodes=8 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192 \ 23 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_8b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 
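# Llama 3 8B BF16 recipe: single node, global batch size 128, FP8 disabled
# (training.model.fp8=False, training.model.fp8_hybrid=False).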
4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_8b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_8b_bf16" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.optim.grad_sync_dtype=bf16 \ 22 | -------------------------------------------------------------------------------- /examples/training/llama/h100/llama3_8b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=llama/llama3_8b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="llama3_8b_fp8" \ 15 | training.run.time_limit=0:15:00 \ 16 | training.trainer.num_nodes=1 \ 17 | training.model.global_batch_size=128 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | training.model.tokenizer.model=${TOK_PATH} \ 21 | +training.model.gc_interval=100 \ 22 | +training.model.optim.grad_sync_dtype=bf16 \ 23 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x3b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x3b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x3b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x3b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x3b \ 10 | stages=[training] \ 11 | 
data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x3b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | training.model.fp8=True \ 18 | +training.model.fp8_params=True \ 19 | +training.model.optim.overlap_param_gather_with_optimizer_step=False \ 20 | +training.model.optim.average_in_collective=True \ 21 | -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x7b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x7b_bf16" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ -------------------------------------------------------------------------------- /examples/training/mixtral/h100/mixtral_8x7b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | TOK_PATH=${TOK_PATH} 7 | 8 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 9 | training=mixtral/mixtral_8x7b \ 10 | stages=[training] \ 11 | data_dir=${DATA_DIR} \ 12 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 13 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 14 | training.run.name="mixtral_8x7b_fp8" \ 15 | training.run.time_limit=0:30:00 \ 16 | training.model.tokenizer.model=${TOK_PATH} \ 17 | +env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \ 18 | training.model.fp8=True \ 19 | +training.model.fp8_params=True \ 20 | +training.model.optim.overlap_param_gather_with_optimizer_step=True \ 21 | +training.model.optim.average_in_collective=True \ 22 | training.model.sequence_parallel=False \ 23 | -------------------------------------------------------------------------------- /examples/training/nemotron/a100/nemotron_22b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_bf16" \ 14 | training.run.time_limit=0:20:00 \ 15 | 
training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/a100/nemotron_8b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_bf16" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_22b_bf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_bf16" \ 14 | training.run.time_limit=0:20:00 \ 15 | training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_22b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_22b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_22b_fp8" \ 14 | training.run.time_limit=0:20:00 \ 15 | training.trainer.num_nodes=2 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=1 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | +training.model.gc_interval=100 \ 21 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_8b_bf16.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_bf16" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=False \ 19 | training.model.fp8_hybrid=False \ 20 | -------------------------------------------------------------------------------- /examples/training/nemotron/h100/nemotron_8b_fp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Users should setup their cluster type in /launcher_scripts/conf/config.yaml 4 | NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"} 5 | DATA_DIR=${DATA_DIR} 6 | 7 | HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \ 8 | training=nemotron/nemotron_8b \ 9 | stages=[training] \ 10 | data_dir=${DATA_DIR} \ 11 | launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \ 12 | base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \ 13 | training.run.name="nemotron_8b_fp8" \ 14 | training.run.time_limit=0:15:00 \ 15 | training.trainer.num_nodes=1 \ 16 | training.model.global_batch_size=32 \ 17 | training.model.micro_batch_size=2 \ 18 | training.model.fp8=True \ 19 | training.model.fp8_hybrid=True \ 20 | +training.model.gc_interval=100 \ 21 | -------------------------------------------------------------------------------- /img/4B_bert_throughput_2211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/4B_bert_throughput_2211.png -------------------------------------------------------------------------------- /img/4b_bert_loss_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/4b_bert_loss_final.png -------------------------------------------------------------------------------- /img/model_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/img/model_overview.png -------------------------------------------------------------------------------- /launcher_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/conf/cluster/bcm.yaml: -------------------------------------------------------------------------------- 1 | partition: null 2 | account: null 3 | exclusive: True 4 | 
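# The fields below are assumed to map to options of the Slurm job (sbatch/srun)
# that the launcher generates for the bcm cluster type.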
gpus_per_task: null 5 | gpus_per_node: 8 6 | mem: 0 7 | job_name_prefix: 'nemo-megatron-' 8 | nodelist: null 9 | srun_args: 10 | - "--no-container-mount-home" 11 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/baichuan2/convert_baichuan2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: baichuan2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_baichuan2.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_baichuan2-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/baichuan2/baichuan2_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/chatglm/convert_chatglm.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_chatglm.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: gpt # gpt or t5, use t5 for mt5 as well 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_chatglm-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/clip/convert_clip.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: clip_vit_B_32 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_clip.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: megatron_clip 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | vocab_file: ${data_dir}/bpe/vocab.txt 22 | merge_file: null 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/controlnet/convert_controlnet.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: controlnet_v1-5 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo-controlnet.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: controlnet 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/dreambooth/convert_dreambooth.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: dreambooth_sd_860m 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_dreambooth.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: dreambooth 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: gpt3_5b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_gpt.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | vocab_file: ${data_dir}/bpe/vocab.json 23 | merge_file: ${data_dir}/bpe/merges.txt 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/imagen/convert_imagen.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: imagen_base64_500m_edm 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_imagen.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: imagen 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest-EMA # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/instruct_pix2pix/convert_instruct_pix2pix.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: instruct_pix2pix_860m_sd_edit 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_instruct_pix2pix.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: instruct_pix2pix 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/instruct-pix2pix--val 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt_*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/llama/convert_llama.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mistral/convert_mistral.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mistral_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mistral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | sequence_parallel: True 21 | pipeline_model_parallel_size: 1 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mixtral/convert_mixtral.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mixtral 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mixtral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_mixtral-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/mixtral/convert_mixtral_8x22b.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: mixtral 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_mixtral.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_mixtral-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/nemotron/convert_nemotron.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: nemotron 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_nemotron.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_nemotron-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/neva/convert_neva.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: neva_llama2_7b_chat 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_neva.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: neva 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 4 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/qwen2/convert_qwen2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_qwen2.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: gpt # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_qwen2-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 2 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/stable_diffusion/convert_stable_diffusion.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: stable_diffusion_860m_res_256_pretrain 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_stable_diffusion.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: stable_diffusion 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints/ 16 | checkpoint_name: latest-EMA # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/starcoder2/convert_starcoder2.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: starcoder2_15b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_starcoder2.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: gpt # gpt or t5, use t5 for mt5 as well 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 2 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | tokenizer_model: ${data_dir}/starcoder2/starcoder2_tokenizer.model 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/t5/convert_t5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: t5_220m 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: megatron_t5.nemo # name of nemo checkpoint; must be .nemo file 12 | pack_nemo_file: True # true to compress as a .nemo file, false to write files under nemo_file_name as a directory 13 | 14 | model: 15 | model_type: t5 # gpt or t5, use t5 for mt5 as well 16 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 17 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 # 1 for 220m, 2 for 3b 20 | pipeline_model_parallel_size: 1 21 | pipeline_model_parallel_split_rank: ${divide_floor:${.pipeline_model_parallel_size}, 2} 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | vocab_file: ${data_dir}/bpe/vocab.txt 24 | merge_file: null 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/conversion/vit/convert_vit.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_${conversion.run.model_train_name} 3 | nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} 7 | convert_name: convert_nemo 8 | model_train_name: vit_B_16 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | results_dir: ${.train_dir}/${.convert_name} 11 | nemo_file_name: nemo_vit_classification.nemo # name of nemo checkpoint; must be .nemo file 12 | 13 | model: 14 | model_type: vit_classification 15 | checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints 16 | checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 17 | hparams_file: ${conversion.run.train_dir}/results/hparams.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/compute_minhashes/compute_minhashes.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'compute-minhashes' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | minhash_length: 260 15 | char_ngram: 5 16 | hash_bytes: 4 17 | seed: 42 18 | num_files: -1 19 | files_per_partition: 10 20 | 21 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/connected_component/connected_component.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'connected-component' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | jaccard_pairs_path: ${data_curation.run.results_dir}/fuzzy_deduped/dedup_final_results.parquet 15 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 16 | cache_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_cache 17 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/fasttext_download/fasttext_download.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'fasttext-download' 3 | results_dir: 
${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "00:20:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | filter_config: 10 | input_field: text 11 | filters: 12 | - name: nemo_curator.filters.classifier_filter.FastTextLangId 13 | log_score: True 14 | params: 15 | model_path: lid.176.bin # Will be automatically downloaded if it doesn't exist -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/find_matching_ngrams/find_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'find-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 2 7 | node_type: cpu 8 | 9 | output_matched_ngram_data: ${.run.results_dir}/matched_ngrams.pkl -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_compute/jaccard_compute.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-compute' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | shuffled_docs_path: ${data_curation.run.results_dir}/fuzzy_deduped/shuffled_docs.parquet 15 | files_per_partition: 5 16 | num_files: -1 17 | 18 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_map_buckets/jaccard_map_buckets.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-map-buckets' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_bucket_dir: ${data_curation.run.results_dir}/fuzzy_deduped/buckets.parquet 15 | num_files: -1 16 | text_ddf_blocksize: 512 17 | 18 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/jaccard_shuffle/jaccard_shuffle.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'jaccard-shuffle' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_bucket_mapping_dir: ${data_curation.run.results_dir}/fuzzy_deduped/anchor_docs_with_bk.parquet 15 | num_files: -1 16 | text_ddf_blocksize: 512 17 | parts_per_worker: 2 18 | 19 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/language_identification/language_identification.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'language-identification' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | 
dependency: "singleton" 5 | time_limit: "04:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | log_scores: store_true 10 | output_retained_document_dir: ${.run.results_dir}/lang_annotated -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/minhash_buckets/minhash_buckets.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'minhash-buckets' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 72GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | input_minhash_dir: ${data_curation.run.results_dir}/fuzzy_deduped/dedup_test_rapids/minhashes.parquet 15 | minhash_length: 260 16 | num_bands: 20 17 | buckets_per_shuffle: 10 18 | 19 | output_fuzzy_deduped_dir: ${data_curation.run.results_dir}/fuzzy_deduped 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/remove_matching_ngrams/remove_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'remove-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 2 7 | node_type: cpu 8 | 9 | output_task_deduped_dir: ${data_dir}/task_deduped -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/separate_by_language/separate_by_language.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'separate-by-language' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "01:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_data_dir: ${.run.results_dir}/lang_separated 10 | output_language_distribution: ${.run.results_dir}/lang_distro.json -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/text_cleaning/text_cleaning.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'text-cleaning' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "04:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_clean_dir: ${.run.results_dir}/clean 10 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/verify_all_pairs_jaccard/verify_all_pairs_jaccard.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'verify-all-pairs-jaccard' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 2 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 2GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 15 | cache_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_cache 16 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/common_crawl/write_deduped_result_with_text/write_deduped_result_with_text.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 
'write-deduped-result-with-text' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | time_limit: "02:00:00" 5 | dependency: "singleton" 6 | nodes: 1 7 | node_type: gpu 8 | 9 | dask: 10 | pool_size: 1GiB 11 | protocol: ucx 12 | interface: ibp12s0 13 | 14 | output_dir: ${data_curation.run.results_dir}/fuzzy_deduped/cc_output 15 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/curate_sft.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'data-curation' 3 | results_dir: ${base_results_dir}/${.name} 4 | 5 | # Many steps in the data curator do not use GPUs 6 | # Adjust configs here if you would like to use different cluster configurations for jobs that do/don't require GPUs 7 | cpu_config: 8 | partition: 9 | 10 | gpu_config: 11 | partition: 12 | 13 | stages: 14 | - task_deduplication 15 | 16 | task_deduplication: 17 | - prepare_task_data 18 | - find_matching_ngrams 19 | - remove_matching_ngrams 20 | 21 | dataset_name: sft 22 | 23 | defaults: 24 | - sft/prepare_task_data/prepare_task_data 25 | - sft/find_matching_ngrams/find_matching_ngrams 26 | - sft/remove_matching_ngrams/remove_matching_ngrams 27 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/find_matching_ngrams/find_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'find-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_matched_ngram_data: ${.run.results_dir}/matched_ngrams.pkl 10 | input_json_text_field: text -------------------------------------------------------------------------------- /launcher_scripts/conf/data_curation/sft/remove_matching_ngrams/remove_matching_ngrams.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: 'remove-matching-ngrams' 3 | results_dir: ${data_curation.run.results_dir}/${.name} 4 | dependency: "singleton" 5 | time_limit: "08:00:00" 6 | nodes: 1 7 | node_type: cpu 8 | 9 | output_task_deduped_dir: ${data_dir}/task_deduped 10 | input_json_text_field: text 11 | max_document_splits: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/baichuan2/download_baichuan2_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_baichuan2_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
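
The tokenizer block that follows (download_tokenizer_url through tokenizer_model) places a SentencePiece model under ${data_dir}/baichuan2. A quick, standalone way to sanity-check such a .model file once it has been downloaded, assuming the sentencepiece package is installed; the snippet and the literal path are illustrative, not part of the launcher:

import sentencepiece as spm

# Load the downloaded tokenizer.model (path mirrors the tokenizer_model setting below)
# and run a tiny round trip to confirm the file is usable.
sp = spm.SentencePieceProcessor(model_file="data/baichuan2/baichuan2_tokenizer.model")
print("vocab size:", sp.get_piece_size())
print(sp.encode("quick tokenizer check", out_type=str))
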
15 | download_tokenizer_url: "https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/baichuan2 18 | tokenizer_model: ${.tokenizer_save_dir}/baichuan2_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/bert/download_bert_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_bert_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_vocab_url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt" # URL to download the vocab from. 16 | download_merges_url: null 17 | vocab_save_dir: ${data_dir} 18 | merges_save_dir: ${data_dir} 19 | tokenizer_type: BertWordPieceLowerCase # Bert model uses BertWordPieceLowerCase tokenizer 20 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 21 | rm_extracted: True # Preprocess script will remove extracted files after preproc. -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/chatglm/download_chatglm_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_chatglm_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_tokenizer_url: "https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/chatglm 18 | tokenizer_model: ${.tokenizer_save_dir}/chatglm_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 
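
The Pile download configs above and below all follow the same pattern: the_pile_url plus a file_numbers range such as "0-29" select which of the 30 Pile shards to fetch, and node_array_size lets a Slurm job array pull one shard per task. A minimal sketch of how such a range string can be expanded into shard URLs, assuming the shards keep the Pile's zero-padded NN.jsonl.zst naming; the helper is illustrative, not the launcher's own code:

def pile_shard_urls(base_url: str, file_numbers: str) -> list[str]:
    """Expand a simple range string like '0-29' into per-shard download URLs."""
    start, end = (int(part) for part in file_numbers.split("-"))
    return [f"{base_url}{index:02d}.jsonl.zst" for index in range(start, end + 1)]

urls = pile_shard_urls(
    "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/",
    "0-29",
)
print(len(urls), "shards, first:", urls[0])
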
21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/code_llama/download_human_eval.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_human_eval 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | array: ${..file_numbers} 7 | 8 | human_eval_url: "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" # Source URL to download The human_eval data. 9 | split_string: "0.7,0.2,0.1" #The ratio to split into train/test/validation 10 | output_dir: ${data_dir}/human_eval #Output to write train.jsonl /test.jsonl /validation.jsonl file 11 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/falcon/download_falcon_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_falcon_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: tiiuae/falcon-7b 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/fid_evaluation/download_coco2014.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_coco2014 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "0:30:00" 5 | dependency: "singleton" 6 | 7 | dataset_output_root: ${data_dir}/fid_evaluation/coco2014 8 | 9 | preprocess_images: True 10 | preprocess_captions: True 11 | num_processes: 8 # set to number of CPUs in the job (-1 defaults to slurm cpus_per_task) -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_gpt3_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
15 | download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. 16 | download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. 17 | vocab_save_dir: ${data_dir}/bpe 18 | merges_save_dir: ${data_dir}/bpe 19 | tokenizer_type: GPT2BPETokenizer 20 | tokenizer_library: megatron 21 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 22 | rm_extracted: True # Preprocess script will remove extracted files after preproc. 23 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_llama_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/llama 18 | tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_nemo_123b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-Large-Instruct-2407 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 
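
The GPT BPE assets referenced above (download_vocab_url and download_merges_url) land in ${data_dir}/bpe, which is where the conversion configs earlier in this tree expect vocab_file and merge_file. A standard-library sketch of fetching the same two files into that layout; the local directory name is illustrative:

import pathlib
import urllib.request

bpe_dir = pathlib.Path("data/bpe")  # stands in for ${data_dir}/bpe
bpe_dir.mkdir(parents=True, exist_ok=True)
assets = {
    "vocab.json": "https://huggingface.co/gpt2/resolve/main/vocab.json",
    "merges.txt": "https://huggingface.co/gpt2/resolve/main/merges.txt",
}
for filename, url in assets.items():
    # Same URLs as download_vocab_url / download_merges_url in download_gpt3_pile.yaml.
    urllib.request.urlretrieve(url, str(bpe_dir / filename))
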
19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_nemo_12b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-Nemo-Base-2407 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mistral/download_mistral_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mistral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mistral-7B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mixtral/download_mixtral_8x22b_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mixtral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
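
The Mistral and Mixtral download configs, including the lines that follow, set tokenizer_library: "huggingface" and put a Hugging Face Hub model id in tokenizer_type rather than downloading a SentencePiece file. A sketch of resolving such an id, assuming the transformers package, network access, and whatever license acceptance the Hub repository requires; illustrative only:

from transformers import AutoTokenizer

# tokenizer_type in these configs is a Hub id, e.g. mistralai/Mistral-7B-v0.1 above.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
print(tokenizer("a short tokenization check")["input_ids"][:8])
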
15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistral-community/Mixtral-8x22B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/mixtral/download_mixtral_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_mixtral_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 2 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | tokenizer_library: "huggingface" 16 | tokenizer_type: mistralai/Mixtral-8x7B-v0.1 17 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 18 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 19 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/multimodal/precache_t5xxl.yaml: -------------------------------------------------------------------------------- 1 | batch_size_per_GPU: 64 # as much as it can fit in your GPU memory 2 | dataloader_num_workers: 16 3 | save_original_in_tar: #[video] 4 | encodings: # see README for instructions 5 | - modality: text 6 | extension: text 7 | key: t5xxl 8 | precision: 16 9 | store_pad_tokens: False 10 | encoder_config: 11 | cls: encoders.t5encoder.T5Encoder 12 | max_seq_len: 64 # see webvid caption length distribution (mostly less than 40 words) 13 | encoder_path: /path/to/encoders # contains t5xxl-encoder.bin 14 | # - modality: video 15 | # extension: mp4 16 | 17 | lightning: 18 | devices: 8 19 | num_nodes: 1 20 | max_epochs: 1 # important for caching 21 | precision: 16 22 | accelerator: gpu 23 | enable_checkpointing: False 24 | strategy: ddp -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/nemotron/download_nemotron_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_nemotron_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "1:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 
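
The preprocessing that preprocess_data enables consumes the extracted .jsonl shards, which hold one JSON object per line with a "text" field (the same field the data-curation configs reference as input_json_text_field). A standard-library sketch of iterating one shard; the filename is illustrative:

import json

with open("data/00.jsonl", encoding="utf-8") as shard:
    for line in shard:
        document = json.loads(line)
        text = document["text"]  # the field consumed by preprocessing and curation steps
        print(len(text.split()), "words in first document")
        break
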
15 | download_tokenizer_url: null 16 | tokenizer_library: "sentencepiece" 17 | tokenizer_save_dir: ${data_dir}/nemotron 18 | tokenizer_model: ${.tokenizer_save_dir}/nemotron_tokenizer.model 19 | rm_downloaded: False # Extract script will remove downloaded zst after extraction 20 | rm_extracted: False # Preprocess script will remove extracted files after preproc. 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/steerlm/steerlm_data_prep1.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: steerlm_dataset_prep1 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 1 7 | bcp_preproc_npernode: 1 # 2 should be safe to use and x2 times faster. 8 | 9 | prep_stage : "1" # make sure wrap in string type 10 | dataset: helpsteer # either openassistant or helpsteer 11 | output_dir: "${data_dir}/steerlm/" # specify output_directory of the downloaded and preprocessed data 12 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 13 | 14 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/steerlm/steerlm_data_prep2_reg.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: steerlm_dataset_prep2_reg 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 1 7 | bcp_preproc_npernode: 1 # 2 should be safe to use and x2 times faster. 8 | 9 | prep_stage: "2" # make sure wrap in string type 10 | input_dataset: "${data_dir}/steerlm/merge_train.jsonl" # for merged train or val jsonl data, see https://github.com/NVIDIA/NeMo-Aligner/blob/main/docs/user-guide/SteerLM.rst#step-2-download-and-preprocess-data-for-attribute-prediction-modelling 11 | output_dir: "${data_dir}/steerlm/merged_train_reg.jsonl" # specify output_directory of the downloaded and preprocessed data 12 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 13 | 14 | -------------------------------------------------------------------------------- /launcher_scripts/conf/data_preparation/t5/download_t5_pile.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: download_t5_pile 3 | results_dir: ${base_results_dir}/${.name} 4 | time_limit: "4:00:00" 5 | dependency: "singleton" 6 | node_array_size: 30 7 | array: ${..file_numbers} 8 | bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. 9 | 10 | dataset: pile 11 | download_the_pile: True # Whether to download the pile dataset from the internet. 12 | the_pile_url: "https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/" # Source URL to download The Pile dataset from. 13 | file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. 14 | preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. 15 | download_vocab_url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt" # URL to download the vocab from. 
16 | download_merges_url: null 17 | vocab_save_dir: ${data_dir}/bpe 18 | merges_save_dir: ${data_dir}/bpe 19 | tokenizer_type: BertWordPieceCase # T5 model uses BertWordPieceCase tokenizer 20 | rm_downloaded: True # Extract script will remove downloaded zst after extraction 21 | rm_extracted: True # Preprocess script will remove extracted files after preproc. -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/chatglm/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-chatglm 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/chatglm/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: chatglm3_6b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-chatglm 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/chatglm/chatglm_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/clip/imagenet_zeroshot.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: eval_${.task_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: clip_vit_B_32 6 | task_name: "imagenet_zeroshot" # Rename this name to be more clear 7 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 8 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 9 | 10 | trainer: 11 | devices: 8 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: bf16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${base_results_dir}/${evaluation.run.model_train_name}/results/checkpoints/nemo_clip.nemo # Path to a trained CLIP .nemo file 19 | precision: ${evaluation.trainer.precision} 20 | micro_batch_size: 1000 21 | global_batch_size: 8000 22 | 23 | data: 24 | num_workers: 8 25 | imagenet_val: ${data_dir}/imagenet_1k/val # path to imagenet val folder -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/falcon/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: falcon_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-falcon 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/llama/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-llama 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: llama2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-llama 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mistral/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mistral_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mistral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mixtral/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mixtral_8x7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mixtral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 8 20 | pipeline_model_parallel_size: 1 21 | sequence_parallel: True 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | precision: bf16 # must match training precision - 32, 16 or bf16 24 | eval_batch_size: 4 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/mixtral/evaluate_all_8x22b.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: mixtral_8x22b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-mixtral 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 8 20 | pipeline_model_parallel_size: 1 21 | sequence_parallel: True 22 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 23 | precision: bf16 # must match training precision - 32, 16 or bf16 24 | eval_batch_size: 4 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/nemotron/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: nemotron 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-nemotron 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g.
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/prompt_gpt3/squad.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "4:00:00" 4 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 5 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 6 | eval_name: eval_prompt_squad 7 | model_train_name: gpt3_5b 8 | tasks: "prompt" # general prompt task 9 | prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task 10 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 11 | 12 | model: 13 | model_type: nemo-gpt3-prompt 14 | nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_gpt_prompt.nemo 15 | tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b 16 | pipeline_model_parallel_size: 1 17 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 18 | precision: bf16 # must match training precision - 32, 16 or bf16 19 | eval_batch_size: 4 20 | prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl 21 | disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/prompt_llama/squad.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "1:00:00" 4 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 5 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 6 | eval_name: eval_prompt_squad 7 | model_train_name: llama_7b 8 | tasks: "prompt" # general prompt task 9 | prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task 10 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 11 | 12 | model: 13 | model_type: nemo-llama-prompt 14 | nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_llama_prompt.nemo 15 | tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b 16 | pipeline_model_parallel_size: 1 17 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 18 | precision: bf16 # must match training precision - 32, 16 or bf16 19 | eval_batch_size: 4 20 | prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl 21 | disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 
22 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/qwen2/evaluate_all.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_all 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-qwen2 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/qwen2/evaluate_boolq.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: ${.eval_name}_${.model_train_name} 3 | time_limit: "02:00:00" 4 | dependency: "singleton" 5 | nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node 6 | ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} 7 | eval_name: eval_boolq 8 | model_train_name: qwen2_7b 9 | train_dir: ${base_results_dir}/${.model_train_name} 10 | tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks 11 | results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} 12 | 13 | model: 14 | model_type: nemo-qwen2 15 | nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints 16 | #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints 17 | #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g.
megatron_gpt-*last.ckpt) 18 | #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml 19 | tensor_model_parallel_size: 1 20 | pipeline_model_parallel_size: 1 21 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 22 | precision: bf16 # must match training precision - 32, 16 or bf16 23 | eval_batch_size: 4 24 | #tokenizer_model: ${data_dir}/qwen2/qwen2_tokenizer.model 25 | -------------------------------------------------------------------------------- /launcher_scripts/conf/evaluation/vit/imagenet_val.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: eval_${.task_name}_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: vit_B_16 6 | task_name: "imagenet_val" # Rename this name to be more clear 7 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 8 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 9 | 10 | trainer: 11 | devices: 1 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: bf16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${evaluation.run.fine_tuning_dir}/results/checkpoints/nemo_vit_classification.nemo # Path to a trained vit .nemo file 19 | precision: ${evaluation.trainer.precision} 20 | micro_batch_size: 512 # we only support DP=1 eval at the moment, GBS=MBS 21 | 22 | data: 23 | num_workers: 8 24 | imagenet_val: ${data_dir}/imagenet_1k/val # path to imagenet val folder -------------------------------------------------------------------------------- /launcher_scripts/conf/export/gpt3/export_gpt3.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: "gpt3_5b" 5 | dependency: "singleton" 6 | training_dir: ${base_results_dir}/${.model_train_name} 7 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 8 | results_dir: ${base_results_dir}/${.model_train_name}/export_${.config_summary} 9 | model_type: "gpt3" 10 | 11 | model: 12 | checkpoint_path: ${export.run.training_dir}/results/checkpoints 13 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 14 | tensor_model_parallel_size: 8 15 | weight_data_type: fp16 # fp32|fp16 16 | processes: 16 17 | load_checkpoints_to_cpu: False 18 | 19 | triton_deployment: 20 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 21 | max_batch_size: 1 22 | pipeline_model_parallel_size: 1 23 | int8_mode: False 24 | enable_custom_all_reduce: False 25 | data_type: fp16 # fp32|fp16|bf16 26 | 27 | benchmark: 28 | input_len: 60 29 | output_len: 20 30 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 31 | triton_wait_time_s: 300 32 | vocab_size: 51200 33 | -------------------------------------------------------------------------------- /launcher_scripts/conf/export/mt5/export_mt5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: t5_invalid_model_name # Add here model name. It must match export configuration.
5 | dependency: "singleton" 6 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 7 | results_dir: ${base_results_dir}/${.model_train_name}/export_from_convert_${.config_summary} 8 | model_type: "mt5" 9 | 10 | model: 11 | checkpoint_path: t5_invalid_path # Set here path of model converted from training 12 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 13 | tensor_model_parallel_size: 8 14 | weight_data_type: fp16 # fp32|fp16 15 | processes: 16 16 | load_checkpoints_to_cpu: False 17 | 18 | triton_deployment: 19 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 20 | max_batch_size: 1 21 | pipeline_model_parallel_size: 1 22 | int8_mode: False 23 | enable_custom_all_reduce: False 24 | data_type: fp16 # fp32|fp16|bf16 25 | 26 | benchmark: 27 | input_len: 60 28 | output_len: 20 29 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 30 | triton_wait_time_s: 300 31 | vocab_size: 250112 32 | -------------------------------------------------------------------------------- /launcher_scripts/conf/export/t5/export_t5.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: export_${.model_train_name} 3 | time_limit: "2:00:00" 4 | model_train_name: mt5_invalid_model_name # Add here model name. It must match export configuration. 5 | dependency: "singleton" 6 | config_summary: tp${export.model.tensor_model_parallel_size}_pp${export.triton_deployment.pipeline_model_parallel_size}_${export.model.weight_data_type}_${export.triton_deployment.data_type} 7 | results_dir: ${base_results_dir}/${.model_train_name}/export_from_convert_${.config_summary} 8 | model_type: "t5" 9 | 10 | model: 11 | checkpoint_path: mt5_invalid_path # Set here path of model converted from training 12 | # FT checkpoint will be saved in ${.triton_model_dir}/1/${.tensor_model_parallel_size}-gpu 13 | tensor_model_parallel_size: 8 14 | weight_data_type: fp16 # fp32|fp16 15 | processes: 16 16 | load_checkpoints_to_cpu: False 17 | 18 | triton_deployment: 19 | triton_model_dir: ${export.run.results_dir}/model_repo/${export.run.model_train_name} 20 | max_batch_size: 1 21 | pipeline_model_parallel_size: 1 22 | int8_mode: False 23 | enable_custom_all_reduce: False 24 | data_type: fp16 # fp32|fp16|bf16 25 | 26 | benchmark: 27 | input_len: 60 28 | output_len: 20 29 | batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] 30 | triton_wait_time_s: 300 31 | vocab_size: 29184 32 | -------------------------------------------------------------------------------- /launcher_scripts/conf/external_conversion/clip/convert_external_clip.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: convert_external_clip 3 | nodes: ${divide_ceil:${external_conversion.model.model_parallel_size}, 8} # 8 gpus per node 4 | time_limit: "2:00:00" 5 | dependency: "singleton" 6 | ntasks_per_node: ${divide_ceil:${external_conversion.model.model_parallel_size}, ${.nodes}} 7 | results_dir: ${base_results_dir}/${.name} 8 | nemo_file_name: converted_${external_conversion.model.arch}_${external_conversion.model.version}.nemo # name of nemo checkpoint; must be .nemo file 9 | 10 | model: 11 | # If converting from OpenCLIP, specify the architecture (`arch`) and version (`version`) from the 12 | # OpenCLIP model list (https://github.com/mlfoundations/open_clip#usage). 
13 | # If converting from Hugging Face, set the version to `huggingface` and the architecture (`arch`) 14 | # to the Hugging Face model name (e.g., `laion/CLIP-ViT-H-14-laion2B-s32B-b79K`). 15 | arch: ViT-H-14 16 | version: laion2b_s32b_b79k 17 | hparams_file: /path/to/modified_hparam.yaml 18 | tensor_model_parallel_size: 1 19 | pipeline_model_parallel_size: 1 20 | model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/clip/clip_similarity.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: clip_vit_B_32 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | image_path: ??? # Path to an image for inference 9 | texts: ??? # List of texts to compute similarity 10 | 11 | trainer: 12 | devices: 1 13 | num_nodes: 1 14 | accelerator: gpu 15 | logger: False # logger provided by exp_manager 16 | precision: bf16 # 16, 32, or bf16 17 | 18 | model: 19 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_clip.nemo # Path to a trained CLIP .nemo file 20 | precision: ${fw_inference.trainer.precision} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/dreambooth/text2img.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: dreambooth_sd_860m 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | infer: 9 | unconditional_guidance_scale: 7.5 10 | num_images_per_prompt: 4 11 | batch_size: 4 12 | height: 512 13 | width: 512 14 | down_factor: 8 15 | inference_steps: 100 16 | sampler_type: 'DDIM' 17 | eta: 0 18 | output_type: 'pil' 19 | save_to_file: True 20 | out_path: ${fw_inference.run.results_dir} 21 | seed: 234 22 | prompts: 23 | - "a photo of a sks dog" 24 | - "a photo of a sks dog in the Acropolis" 25 | - "a photo of a sks dog in front of eiffel tower" 26 | - "a photo of sks dog sleeping" 27 | - "a photo of a sks dog riding a bike" 28 | 29 | trainer: 30 | devices: 1 31 | num_nodes: 1 32 | accelerator: gpu 33 | precision: 16 34 | logger: False # logger provided by exp_manager 35 | 36 | model: 37 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/convert_nemo/results/nemo_dreambooth.nemo # Path to a trained DreamBooth .nemo file 38 | precision: ${fw_inference.trainer.precision} 39 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/instruct_pix2pix/edit_cli.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: instruct_pix2pix_860m_sd_edit 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | edit: 9 | resolution: 512 10 | steps: 100 11 | input: ???
# path/to/input/picture 12 | outpath: ${fw_inference.run.results_dir} 13 | prompt: "" 14 | cfg_text: 7.5 15 | cfg_image: 1.2 16 | num_images_per_prompt: 8 17 | combine_images: [2, 4] # [row, column]; set to null if you don't want to combine 18 | seed: 1234 19 | 20 | trainer: 21 | devices: 1 22 | num_nodes: 1 23 | accelerator: gpu 24 | logger: False # logger provided by exp_manager 25 | precision: bf16 # 16, 32, or bf16 26 | 27 | model: 28 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_instruct_pix2pix.nemo # Path to a trained InstructPix2Pix .nemo file 29 | precision: ${fw_inference.trainer.precision} 30 | 31 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/nsfw/nsfw.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "00:10:00" 4 | dependency: "singleton" 5 | model_train_name: nsfw_L_14 6 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 7 | 8 | image_path: ??? # Path to an image for inference 9 | 10 | trainer: 11 | devices: 1 12 | num_nodes: 1 13 | accelerator: gpu 14 | logger: False # logger provided by exp_manager 15 | precision: 16 # 16, 32, or bf16 16 | 17 | model: 18 | restore_from_path: ${base_results_dir}/${fw_inference.run.model_train_name}/results/checkpoints/nemo_nsfw.nemo 19 | precision: ${fw_inference.trainer.precision} 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/fw_inference/vit/imagenet1k.yaml: -------------------------------------------------------------------------------- 1 | run: 2 | name: fw_inference_${.model_train_name} 3 | time_limit: "04:00:00" 4 | dependency: "singleton" 5 | model_train_name: vit_B_16 6 | fine_tuning_dir: ${base_results_dir}/${.model_train_name}/imagenet_1k 7 | results_dir: ${base_results_dir}/${.model_train_name}/${.name} 8 | 9 | data_path: ???
# Path to a image folder for inference 10 | 11 | trainer: 12 | devices: 1 13 | num_nodes: 1 14 | accelerator: gpu 15 | logger: False # logger provided by exp_manager 16 | precision: bf16 # 16, 32, or bf16 17 | 18 | model: 19 | restore_from_path: ${fw_inference.run.fine_tuning_dir}/results/checkpoints/nemo_vit_classification.nemo # Path to a trained vit .nemo file 20 | precision: ${fw_inference.trainer.precision} 21 | -------------------------------------------------------------------------------- /launcher_scripts/conf/rag_indexing/bert/110m.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | searchpath: 3 | - file:///opt/NeMo/examples/nlp/language_modeling/conf 4 | 5 | run: 6 | name: ${.eval_name}_${.model_train_name} 7 | time_limit: "4:00:00" 8 | dependency: "singleton" 9 | nodes: 1 10 | ntasks_per_node: 1 11 | eval_name: rag_indexing 12 | model_train_name: bert 13 | results_dir: ${base_results_dir}/${.name} 14 | 15 | trainer: 16 | devices: 1 17 | num_nodes: 1 18 | accelerator: gpu 19 | logger: False # logger provided by exp_manager 20 | precision: 'bf16-mixed' 21 | use_distributed_sampler: False 22 | 23 | indexing: 24 | embedder: 25 | model_type: bert 26 | model_path: null 27 | embed_batch_size: 128 28 | data: 29 | data_path: null 30 | chunk_size: 256 31 | chunk_overlap: 10 32 | index_path: null -------------------------------------------------------------------------------- /launcher_scripts/conf/rag_indexing/bert/340m.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | searchpath: 3 | - file:///opt/NeMo/examples/nlp/language_modeling/conf 4 | 5 | run: 6 | name: ${.eval_name}_${.model_train_name} 7 | time_limit: "4:00:00" 8 | dependency: "singleton" 9 | nodes: 1 10 | ntasks_per_node: 1 11 | eval_name: rag_indexing 12 | model_train_name: bert 13 | results_dir: ${base_results_dir}/${.name} 14 | 15 | trainer: 16 | devices: 1 17 | num_nodes: 1 18 | accelerator: gpu 19 | logger: False # logger provided by exp_manager 20 | precision: 'bf16-mixed' 21 | use_distributed_sampler: False 22 | 23 | indexing: 24 | embedder: 25 | model_type: bert 26 | model_path: null 27 | embed_batch_size: 128 28 | data: 29 | data_path: null 30 | chunk_size: 256 31 | chunk_overlap: 10 32 | index_path: null -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/random.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.random_background.RandomBackground 2 | base_background: [1, 1, 1] 3 | random_ratio: 0.5 4 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/static.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground 2 | background: [0, 0, 1] # rgb 3 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/tcnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.tcnn_background.TCNNBackground 2 | bound: 1 3 | encoder_num_input_dims: 3 # 3 directions 4 | encoder_cfg: 5 | otype: "HashGrid" 6 | n_levels: 16 7 | n_features_per_level: 2 8 | 
log2_hashmap_size: 19 9 | base_resolution: 16 10 | interpolation: "Smoothstep" 11 | per_level_scale: # default is np.exp2(np.log2(2048 * bound / 16) / (16 - 1)) 12 | 13 | background_net_num_output_dims: 3 # rgb 14 | background_net_cfg: 15 | otype: "FullyFusedMLP" 16 | activation: "ReLU" 17 | output_activation: "None" 18 | n_neurons: 32 19 | n_hidden_layers: 2 20 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/background/torchngp.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.background.torchngp_background.TorchNGPBackground 2 | 3 | encoder_type: "frequency" 4 | encoder_input_dims: 3 5 | encoder_multi_res: 6 6 | 7 | num_output_dims: 3 8 | net_cfg: 9 | num_hidden_dims: 32 10 | num_layers: 2 11 | bias: True 12 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/data/data.yaml: -------------------------------------------------------------------------------- 1 | _target_: data.AggregatorDataModule 2 | 3 | train_batch_size: 1 4 | train_shuffle: false 5 | train_dataset: 6 | _target_: nemo.collections.multimodal.data.nerf.random_poses.RandomPosesDataset 7 | internal_batch_size: 100 8 | width: 64 9 | height: 64 10 | radius_range: [3.0, 3.5] 11 | theta_range: [45, 105] 12 | phi_range: [-180, 180] 13 | fovx_range: [10, 30] 14 | fovy_range: [10, 30] 15 | jitter: False 16 | jitter_center: 0.2 17 | jitter_target: 0.2 18 | jitter_up: 0.02 19 | uniform_sphere_rate: 0 20 | angle_overhead: 30 21 | angle_front: 60 22 | 23 | val_batch_size: 1 24 | val_shuffle: false 25 | val_dataset: 26 | _target_: nemo.collections.multimodal.data.nerf.circle_poses.CirclePosesDataset 27 | size: 5 28 | width: 800 29 | height: 800 30 | angle_overhead: 30 31 | angle_front: 60 32 | 33 | test_batch_size: 1 34 | test_shuffle: false 35 | test_dataset: 36 | _target_: nemo.collections.multimodal.data.nerf.circle_poses.CirclePosesDataset 37 | size: 100 38 | width: 800 39 | height: 800 40 | angle_overhead: 30 41 | angle_front: 60 42 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/dreamfusion-dmtet.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion # TODO(ahmadki): dreamfusion-dmetet should have it's own class 2 | defaults: 3 | - nerf: torchngp 4 | - background: torchngp 5 | - material: basic_shading 6 | - renderer: nvdiffrast 7 | - guidance: sd_huggingface 8 | - optim: adan 9 | - loss: dmtet 10 | - data: data 11 | - _self_ 12 | 13 | ### model options 14 | resume_from_checkpoint: 15 | prompt: 'a hamburger' 16 | negative_prompt: '' 17 | front_prompt: ', front view' 18 | side_prompt: ', side view' 19 | back_prompt: ', back view' 20 | update_extra_interval: 16 21 | guidance_scale: 100 22 | export_video: False 23 | 24 | iters: ${training.trainer.max_steps} 25 | # TODO(ahmadki): move to database 26 | latent_iter_ratio: 0.0 27 | albedo_iter_ratio: 0 28 | min_ambient_ratio: 0.1 29 | textureless_ratio: 0.2 30 | 31 | data: 32 | train_dataset: 33 | width: 512 34 | height: 512 35 | val_dataset: 36 | width: 800 37 | height: 800 38 | test_dataset: 39 | width: 800 40 | height: 800 41 | -------------------------------------------------------------------------------- 
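The NeRF model configs in this section are standard Hydra object configs: `_target_` names a Python class and the remaining keys become its constructor arguments, while the `defaults` list in the DreamFusion model configs swaps the nerf/background/renderer/guidance sub-configs in and out before instantiation. A minimal sketch of how such a config is typically turned into an object with `hydra.utils.instantiate` follows; the inline config mirrors model/background/static.yaml above, and it assumes the NeMo multimodal collection is installed (only the class path is copied from the repo, the rest is illustrative).

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Inline stand-in for a config like model/background/static.yaml.
# `_target_` is resolved to a class; the remaining keys are passed as kwargs.
cfg = OmegaConf.create({
    "_target_": "nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground",
    "background": [0, 0, 1],  # rgb
})

# Imports StaticBackground and calls StaticBackground(background=[0, 0, 1]).
background = instantiate(cfg)

Instantiation is recursive, so nested sub-configs that carry their own `_target_` (as in data.yaml's train/val/test datasets) are built the same way.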
/launcher_scripts/conf/training/nerf/model/dreamfusion.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion 2 | defaults: 3 | - nerf: torchngp 4 | - background: static 5 | - material: basic_shading 6 | - renderer: torchngp_raymarching 7 | - guidance: sd_nemo 8 | - optim: adan 9 | - loss: dreamfusion 10 | - data: data 11 | - _self_ 12 | 13 | ### model options 14 | resume_from_checkpoint: 15 | prompt: 'a hamburger' 16 | negative_prompt: '' 17 | front_prompt: ', front view' 18 | side_prompt: ', side view' 19 | back_prompt: ', back view' 20 | update_extra_interval: 16 21 | guidance_scale: 100 22 | export_video: False 23 | 24 | iters: ${training.trainer.max_steps} 25 | # TODO(ahmadki): move to database 26 | latent_iter_ratio: 0.2 27 | albedo_iter_ratio: 0.0 28 | min_ambient_ratio: 0.1 29 | textureless_ratio: 0.2 30 | 31 | data: 32 | train_dataset: 33 | width: 64 34 | height: 64 35 | val_dataset: 36 | width: 800 37 | height: 800 38 | test_dataset: 39 | width: 800 40 | height: 800 41 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_huggingface.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_huggingface_pipeline.StableDiffusion 2 | precision: ${training.trainer.precision} 3 | model_key: stabilityai/stable-diffusion-2-1-base 4 | t_range: [0.02, 0.98] 5 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_nemo.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_nemo_pipeline.StableDiffusion 2 | checkpoint: /sd_checkpoints/nemo-1.5/sd-1.5.nemo 3 | sampler_type: 'DDIM' 4 | t_range: [0.02, 0.98] 5 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/guidance/sd_trt.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_trt_pipeline.StableDiffusion 2 | checkpoint: /sd_checkpoints/nemo-1.5/sd-1.5.nemo 3 | plan_dir: /sd_checkpoints/nemo-1.5/plan 4 | sampler_type: 'DDIM' 5 | t_range: [0.02, 0.98] 6 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/loss/dmtet.yaml: -------------------------------------------------------------------------------- 1 | lambda_sds: 1.0 2 | lambda_opacity: 0.0 3 | lambda_entropy: 0.0 4 | lambda_orientation: 0.0 5 | lambda_2d_normal_smooth: 0.0 6 | lambda_3d_normal_smooth: 0.0 7 | lambda_mesh_normal: 0.5 8 | lambda_mesh_laplacian: 0.5 9 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/loss/dreamfusion.yaml: -------------------------------------------------------------------------------- 1 | lambda_sds: 1.0 2 | lambda_opacity: 0.0 3 | lambda_entropy: 1e-3 4 | lambda_orientation: 1e-2 5 | lambda_2d_normal_smooth: 0.0 6 | lambda_3d_normal_smooth: 0.0 7 | lambda_mesh_normal: 0.0 8 | lambda_mesh_laplacian: 0.0 9 | --------------------------------------------------------------------------------
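The two loss configs above (loss/dmtet.yaml and loss/dreamfusion.yaml) are plain dictionaries of per-term weights: each `lambda_*` scales one component of the training objective, and a zero weight effectively disables that term (the DMTet variant adds mesh normal and Laplacian regularizers, while the DreamFusion variant relies on entropy and orientation terms instead). The sketch below shows one common way such weights are consumed; the term names, helper function, and dummy values are illustrative, not the repo's actual loss implementation.

import torch
from omegaconf import OmegaConf

# Weights copied from loss/dreamfusion.yaml above.
loss_cfg = OmegaConf.create({
    "lambda_sds": 1.0,
    "lambda_opacity": 0.0,
    "lambda_entropy": 1e-3,
    "lambda_orientation": 1e-2,
    "lambda_2d_normal_smooth": 0.0,
    "lambda_3d_normal_smooth": 0.0,
    "lambda_mesh_normal": 0.0,
    "lambda_mesh_laplacian": 0.0,
})

def total_loss(terms: dict, cfg) -> torch.Tensor:
    """Weighted sum of loss terms; terms whose lambda is zero are skipped."""
    total = torch.zeros(())
    for name, value in terms.items():
        weight = float(cfg.get(f"lambda_{name}", 0.0))
        if weight != 0.0:
            total = total + weight * value
    return total

# Example with dummy scalar losses:
terms = {"sds": torch.tensor(2.0), "entropy": torch.tensor(0.5), "opacity": torch.tensor(3.0)}
print(total_loss(terms, loss_cfg))  # 1.0 * 2.0 + 1e-3 * 0.5; opacity is weighted by 0.0 and dropped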
/launcher_scripts/conf/training/nerf/model/material/basic_shading.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.materials.basic_shading.BasicShading 2 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/nerf/tcnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.geometry.tcnn_nerf.TCNNNerf 2 | num_input_dims: 3 # 3D space 3 | bound: 1 4 | density_activation: softplus # softplus, exp 5 | blob_radius: 0.5 6 | blob_density: 10 7 | normal_type: central_finite_difference 8 | 9 | encoder_cfg: 10 | otype: "HashGrid" 11 | n_levels: 16 12 | n_features_per_level: 2 13 | log2_hashmap_size: 19 14 | base_resolution: 16 15 | interpolation: "Smoothstep" 16 | per_level_scale: # default is np.exp2(np.log2(2048 * bound / 16) / (16 - 1)) 17 | 18 | sigma_net_num_output_dims: 1 # density 19 | sigma_net_cfg: 20 | otype: "FullyFusedMLP" 21 | activation: "ReLU" 22 | output_activation: "None" 23 | n_neurons: 64 24 | n_hidden_layers: 3 25 | 26 | features_net_num_output_dims: 3 # rgb 27 | features_net_cfg: 28 | otype: "FullyFusedMLP" 29 | activation: "ReLU" 30 | output_activation: "None" 31 | n_neurons: 64 32 | n_hidden_layers: 3 33 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/nerf/torchngp.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.geometry.torchngp_nerf.TorchNGPNerf 2 | num_input_dims: 3 # 3D space 3 | bound: 1 4 | density_activation: exp # softplus, exp 5 | blob_radius: 0.2 6 | blob_density: 5 7 | normal_type: central_finite_difference 8 | 9 | encoder_cfg: 10 | encoder_type: 'hashgrid' 11 | encoder_max_level: 12 | log2_hashmap_size: 19 13 | desired_resolution: 2048 14 | interpolation: smoothstep 15 | 16 | sigma_net_num_output_dims: 1 # density 17 | sigma_net_cfg: 18 | num_hidden_dims: 64 19 | num_layers: 3 20 | bias: True # FIXME(ahmadki):exp: does it makes sense that it's True ? 
21 | 22 | features_net_num_output_dims: 3 # rgb 23 | features_net_cfg: 24 | num_hidden_dims: 64 25 | num_layers: 3 26 | bias: True 27 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/optim/adan.yaml: -------------------------------------------------------------------------------- 1 | name: adan 2 | lr: 5e-3 3 | eps: 1e-8 4 | weight_decay: 2e-5 5 | max_grad_norm: 5.0 6 | foreach: False 7 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/nerfacc.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.nerfacc_volume_renderer.NerfaccVolumeBaseRenderer 2 | grid_resolution: 128 3 | grid_levels: 3 4 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 5 | render_step_size: 1.e-3 6 | near_plane: 0.2 7 | cone_angle: 0.004 8 | alpha_thre: 1.e-2 9 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/nvdiffrast.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.nvdiffrast_renderer.NVDiffRastRenderer 2 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 3 | grid_resolution: 128 4 | density_thresh: 10.0 5 | update_interval: 16 6 | quartet_file: "/results/tets/128_tets.npz" # FIXME(ahmadki): documentation 7 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/nerf/model/renderer/torchngp_raymarching.yaml: -------------------------------------------------------------------------------- 1 | _target_: nemo.collections.multimodal.modules.nerf.renderers.torchngp_volume_renderer.TorchNGPVolumeRenderer 2 | bound: 1 # ${training.model.nerf.bound} # FIXME(ahmadki) 3 | update_interval: 16 4 | grid_resolution: 128 5 | density_thresh: 10 6 | max_steps: 1024 7 | dt_gamma: 0 8 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | set_sm_margin: 0 9 | 10 | qkv_wgrad: 11 | method: bulk 12 | num_sm: 2 13 | set_sm_margin: 0 14 | 15 | fc1_dgrad: 16 | method: bulk 17 | num_sm: 2 18 | set_sm_margin: 0 19 | 20 | fc1_wgrad: 21 | method: bulk 22 | num_sm: 2 23 | set_sm_margin: 0 24 | 25 | ## Ring-exchange overlap with AllGather 26 | qkv_fprop: 27 | method: ring_exchange 28 | aggregate: 0 29 | 30 | proj_dgrad: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | fc1_fprop: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc2_dgrad: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | # Chunked-collective overlap with ReduceScatter 43 | proj_fprop: 44 | method: pipeline 45 | num_sm: 4 46 | num_splits: 4 47 | set_sm_margin: 0 48 | 49 | fc2_fprop: 50 | method: pipeline 51 | num_sm: 4 52 | num_splits: 4 53 | set_sm_margin: 0 54 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml: 
-------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | set_sm_margin: 0 9 | 10 | qkv_wgrad: 11 | method: bulk 12 | num_sm: 2 13 | set_sm_margin: 0 14 | 15 | fc1_dgrad: 16 | method: bulk 17 | num_sm: 2 18 | set_sm_margin: 0 19 | 20 | fc1_wgrad: 21 | method: bulk 22 | num_sm: 2 23 | set_sm_margin: 0 24 | 25 | ## Ring-exchange overlap with AllGather 26 | qkv_fprop: 27 | method: ring_exchange 28 | aggregate: 0 29 | 30 | proj_dgrad: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | fc1_fprop: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc2_dgrad: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | # Chunked-collective overlap with ReduceScatter 43 | proj_fprop: 44 | method: pipeline 45 | num_sm: 8 46 | num_splits: 4 47 | set_sm_margin: 0 48 | 49 | fc2_fprop: 50 | method: pipeline 51 | num_sm: 4 52 | num_splits: 4 53 | set_sm_margin: 0 54 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 2 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 1 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 1 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 1 45 | 46 | # Chunked 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 0 54 | atomic_gemm: 0 55 | 56 | fc2_fprop: 57 | method: pipeline 58 | num_sm: 8 59 | cga_size: 2 60 | num_splits: 4 61 | set_sm_margin: 1 62 | fp8_buf: 0 63 | atomic_gemm: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 2 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 2 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 1 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 1 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked 47 | proj_fprop: 48 | method: pipeline 49 |
num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | atomic_gemm: 0 55 | 56 | fc2_fprop: 57 | method: pipeline 58 | num_sm: 16 59 | cga_size: 2 60 | num_splits: 4 61 | set_sm_margin: 1 62 | fp8_buf: 1 63 | atomic_gemm: 0 -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h8192_tp4_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: pipeline 57 | num_sm: 16 58 | cga_size: 2 59 | num_splits: 4 60 | set_sm_margin: 1 61 | fp8_buf: 1 62 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 4 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: ring_exchange 57 | num_sm: 1 58 | set_sm_margin: 1 59 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP4/MBS2/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 4 14 | cga_size: 2 15 |
set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | fp8_buf: 1 54 | 55 | fc2_fprop: 56 | method: ring_exchange 57 | num_sm: 1 58 | set_sm_margin: 1 59 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8 3 | 4 | # Bulk overlap with AllGather 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 16 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 4 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 16 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 1 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 16 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 24 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h5120_tp2_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/13B/TP2/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 8 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 20 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | 
-------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp2_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP2/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 32 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 8 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | proj_fprop: 47 | method: ring_exchange 48 | num_sm: 1 49 | set_sm_margin: 1 50 | 51 | fc2_fprop: 52 | method: ring_exchange 53 | num_sm: 1 54 | set_sm_margin: 1 55 | 56 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp4_mbs1_seqlen4096.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen4K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 8 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 16 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 | fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 1 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 16 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h8192_tp4_mbs1_seqlen8192.yaml: -------------------------------------------------------------------------------- 1 | # UB communicator configurations 2 | # Model configs: H100/70B/TP4/MBS1/SeqLen8K/FP8 3 | 4 | # Bulk overlap with AllGather / ReduceScatter 5 | qkv_dgrad: 6 | method: bulk 7 | num_sm: 4 8 | cga_size: 2 9 | set_sm_margin: 0 10 | 11 | qkv_wgrad: 12 | method: bulk 13 | num_sm: 24 14 | cga_size: 2 15 | set_sm_margin: 0 16 | 17 | fc1_dgrad: 18 | method: bulk 19 | num_sm: 2 20 | cga_size: 2 21 | set_sm_margin: 0 22 | 23 | fc1_wgrad: 24 | method: bulk 25 | num_sm: 4 26 | cga_size: 2 27 | set_sm_margin: 0 28 | 29 | ## Ring-exchange overlap with AllGather 30 | qkv_fprop: 31 | method: ring_exchange 32 | aggregate: 0 33 | 34 | proj_dgrad: 35 | method: ring_exchange 36 | aggregate: 0 37 | 38 |
fc1_fprop: 39 | method: ring_exchange 40 | aggregate: 0 41 | 42 | fc2_dgrad: 43 | method: ring_exchange 44 | aggregate: 0 45 | 46 | # Chunked-collective overlap with ReduceScatter 47 | proj_fprop: 48 | method: pipeline 49 | num_sm: 24 50 | cga_size: 2 51 | num_splits: 4 52 | set_sm_margin: 1 53 | 54 | fc2_fprop: 55 | method: pipeline 56 | num_sm: 16 57 | cga_size: 2 58 | num_splits: 4 59 | set_sm_margin: 1 60 | -------------------------------------------------------------------------------- /launcher_scripts/data/nsfw/concepts.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/auto_blend.yaml: -------------------------------------------------------------------------------- 1 | model_type: mt5 2 | preprocessed_dir: null 3 | blending_alpha: 1.0 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/checkpoint_search.yaml: -------------------------------------------------------------------------------- 1 | checkpoint_folder: null 2 | checkpoint_name: latest 3 | tensor_model_parallel_size: 1 4 | pipeline_model_parallel_size: 1 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/get_ag_overlap.yaml: -------------------------------------------------------------------------------- 1 | name: 'get_ag_overlap' 2 | fp8: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/get_ln_sm_margin.yaml: -------------------------------------------------------------------------------- 1 | name: 'get_ln_sm_margin' -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/hparams_override.yaml: -------------------------------------------------------------------------------- 1 | hparams_file: null 2 | output_path: null 3 | 4 | vocab_file: null 5 | merge_file: null 6 | tokenizer_model: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/conf/numa_mapping.yaml: -------------------------------------------------------------------------------- 1 | # GPU NUMA Mapping Config 2 | enable: True # Set to False to disable all mapping (performance will suffer). 3 | mode: unique_contiguous # One of: all, single, single_unique, unique_interleaved or unique_contiguous. 4 | scope: node # Either node or socket. 5 | cores: all_logical # Either all_logical or single_logical. 6 | balanced: True # Whether to assign an equal number of physical cores to each process. 7 | min_cores: 1 # Minimum number of physical cores per process. 8 | max_cores: 8 # Maximum number of physical cores per process. Can be null to use all available cores. -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/datacuration_scripts/download_fasttext.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # 19 | # Downloads the FastText classifier 20 | # 21 | 22 | 23 | set -eu 24 | 25 | res_file=$1 26 | 27 | ## Download the fasttext model 28 | if [ !
-f ${res_file} ]; then 29 | wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O ${res_file} 30 | fi -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/custom_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/fid_evaluation_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | preprocess_images: True 2 | preprocess_captions: True 3 | root_dir: /path/to/fid_evaluation/coco2014/ 4 | num_processes: 8 -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/mc4_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/multimodal_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # this file is only a template for the hydra arguments 2 | # you should not edit or use this file unless you are debugging data prep scripts directly 3 | 4 | dataset_repo_id: 5 | dataset_output_root: 6 | 7 | input_dir: 8 | output_dir: 9 | parquet_subpartitions: 10 | parquet_pattern: 11 | num_parquets_downloaded: 12 | download_num_processes: 13 | download_num_threads: 14 | img2dataset_additional_arguments: 15 | node_array_size: 16 | tar_chunk_size: 17 | file_ext_in_tar: 18 | precache_config_path: 19 | output_wdinfo_path: 20 | append_tar_dir: 21 | source_dir: 22 | source_extensions: 23 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | data_config: download_gpt3_pile 2 | cluster_type: bcm 3 | launcher_scripts_path: null 4 | data_dir: null 5 | the_pile_url: null 6 | file_numbers: null 7 | rm_downloaded: True 8 | rm_extracted: True 9 | tokenizer_type: null 10 | vocab_save_dir: null 11 | merges_save_dir: null 12 | tokenizer_library: null 13 | tokenizer_model: null 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/slim_pajama_dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/dataprep_scripts/slim_pajama_dataprep/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/hydra_logging@_group_: none 4 | - override /hydra/job_logging@_group_: none 5 | hydra: 6 | run: 7 | dir: . 8 | output_subdir: null 9 | data_config: download_slim_pajama 10 | cluster_type: bcm 11 | launcher_scripts_path: null 12 | data_dir: null 13 | slim_pajama_url: null 14 | approved_sources: null 15 | file_numbers: null 16 | rm_downloaded: True 17 | rm_extracted: True 18 | tokenizer_type: null 19 | vocab_save_dir: null 20 | merges_save_dir: null 21 | tokenizer_library: null 22 | tokenizer_model: null 23 | preprocessed_dir: null -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/eval_harness/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/export_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/collections/export_scripts/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/collections/metric_calculation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/collections/metric_calculation/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Base Model Conversion 4 | name: nemo-framework-conversion 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Data Preparation 4 | name: nemo-framework-data-prep 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: data-prep-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | nodes: training.trainer.num_nodes 9 | 10 | dataPrepConfig: 11 | # Specify the amount of shared memory to attach to the Pods # 12 | shmSize: 512Gi 13 | 14 | # Insert the address for the NFS server if using NFS for 
model storage # 15 | NFSServer: 16 | 17 | # Insert the path to save data on the NFS server # 18 | NFSPath: 19 | 20 | # Insert the total number of processes to spawn on the cluster # 21 | totalProcesses: 22 | 23 | # Insert the number of processes to spawn per node # 24 | procsPerNode: 25 | 26 | # Insert the data preparation stage, such as download, extract, or preprocess # 27 | stage: 28 | 29 | # Insert the dnsPolicy # 30 | dnsPolicy: "nil" 31 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Evaluation 4 | name: nemo-framework-evaluation 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: evaluation-config 5 | data: 6 | hparams.yaml: |- 7 | {{ (.Files.Glob "config/hparams.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/peft/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework PEFT 4 | name: nemo-framework-peft 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/peft/peft-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-peft-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework RLHF PPO training 4 | name: nemo-framework-rlhf-ppo 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/rlhf-ppo-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-config 5 | data: 6 | config_critic.yaml: |- 7 | {{ (.Files.Glob "config/gpt_ppo_critic.yaml").AsConfig | indent 4 }} 8 | config_actor.yaml: |- 9 | {{ (.Files.Glob "config/gpt_ppo_actor.yaml").AsConfig | indent 4 }} 10 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/rlhf_ppo/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | trainingConfig: 9 | # Specify the amount of shared memory to attach to the Pods # 10 | shmSize: 512Gi 11 | 12 | # Insert the address for the NFS server if using NFS for 
model storage # 13 | NFSServer: 14 | 15 | # Insert the path to save data on the NFS server # 16 | NFSPath: 17 | 18 | # Specify the k8s resource name for IB devices # 19 | ibResourceName: nvidia.com/hostdev 20 | 21 | # Specify the number of IB devices to include in pods # 22 | ibCount: "0" 23 | 24 | # Specify the WandB API key if using WandB for logging # 25 | wandbKey: "nil" 26 | 27 | # Insert the dnsPolicy # 28 | dnsPolicy: "nil" 29 | 30 | critic: 31 | numGPUs: rlhf_ppo.critic.trainer.devices 32 | nodes: rlhf_ppo.critic.trainer.num_nodes 33 | 34 | actor: 35 | numGPUs: rlhf_ppo.actor.trainer.devices 36 | nodes: rlhf_ppo.actor.trainer.num_nodes 37 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | appVersion: "1.0" 3 | description: NeMo Framework Base Model Training 4 | name: nemo-framework-training 5 | version: 1.0.0 6 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: training-config 5 | data: 6 | config.yaml: |- 7 | {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} 8 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | trainingImage: cfg.container 3 | pullPolicy: IfNotPresent 4 | 5 | # Insert the name of your container registry pull secret # 6 | pullSecret: nvcr.io 7 | 8 | numGPUs: training.trainer.devices 9 | nodes: training.trainer.num_nodes 10 | 11 | trainingConfig: 12 | # Specify the amount of shared memory to attach to the Pods # 13 | shmSize: 512Gi 14 | 15 | # Insert the address for the NFS server if using NFS for model storage # 16 | NFSServer: 17 | 18 | # Insert the path to save data on the NFS server # 19 | NFSPath: 20 | 21 | # Specify the k8s resource name for IB devices. Can be string or list of strings. If list, must be same length as ibCount # 22 | ibResourceName: nvidia.com/hostdev 23 | 24 | # Specify the number of IB devices to include in pods. Can be string or list. If list, must be same length as ibResourceName # 25 | ibCount: "8" 26 | 27 | # Specify the number of IB networks to include in pods. Should be a comma separated set of networks. # 28 | ibNetworkAnnotation: "" 29 | 30 | # Specify the WandB API key if using WandB for logging # 31 | wandbKey: "nil" 32 | 33 | # Insert the dnsPolicy # 34 | dnsPolicy: "nil" 35 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/core/v2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/nemo_launcher/core/v2/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/nemo_launcher/utils/data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /launcher_scripts/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/config_tests/test_cluster_config.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | 4 | class TestClusterConfig: 5 | def test_cluster_bcm_config(self): 6 | conf = OmegaConf.load("conf/cluster/bcm.yaml") 7 | s = """ 8 | partition: null 9 | account: null 10 | exclusive: True 11 | gpus_per_task: null 12 | gpus_per_node: 8 13 | mem: 0 14 | job_name_prefix: "nemo-megatron-" 15 | nodelist: null 16 | srun_args: 17 | - "--no-container-mount-home" 18 | """ 19 | expected = OmegaConf.create(s) 20 | assert ( 21 | expected == conf 22 | ), f"conf/cluster/bcm.yaml must be set to {expected} but it currently is {conf}." 
23 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/stages_tests/__init__.py -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_adapters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def adapter_learning(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[adapter_learning] " 11 | f"adapter_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestAdapterLearn: 25 | def test_gpt3(self): 26 | 27 | output = adapter_learning("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = adapter_learning("t5") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def convert(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[conversion] " 11 | f"conversion={model_type}/convert_{model_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestConvert: 25 | def test_gpt3(self): 26 | 27 | output = convert("gpt3") 28 | assert ERROR in output 29 | 30 | def test_prompt_t5(self): 31 | 32 | output = convert("t5") 33 | assert ERROR in output 34 | 35 | def test_prompt_mt5(self): 36 | 37 | output = convert("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_data_prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def data_prep(model_type, data_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[data_preparation] " 11 | f"data_preparation={model_type}/{data_type} " 12 | "launcher_scripts_path=. 
" 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestDataPrep: 25 | def test_gpt3(self): 26 | 27 | output = data_prep("gpt3", "download_gpt3_pile") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = data_prep("t5", "download_t5_pile") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = data_prep("mt5", "download_mc4") 38 | assert ERROR in output 39 | 40 | def test_bert(self): 41 | 42 | output = data_prep("bert", "download_bert_pile") 43 | assert ERROR in output 44 | 45 | def test_remove_folders(self): 46 | os.system("rm -rf test_folder") 47 | os.system("rm -rf data") 48 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def export(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[export] " 11 | f"export={model_type}/export_{model_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestExport: 25 | def test_gpt3(self): 26 | 27 | output = export("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = export("t5") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = export("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_fine_tune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def fine_tune(model_type, task_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[fine_tuning] " 11 | f"fine_tuning={model_type}/{task_type} " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestFineTune: 25 | def test_t5(self): 26 | 27 | output = fine_tune("t5", "squad") 28 | assert ERROR in output 29 | 30 | def test_mt5(self): 31 | 32 | output = fine_tune("mt5", "xquad") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_ia3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 
5 | 6 | 7 | def ia3_learning(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[ia3_learning] " 11 | f"ia3_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestIA3Learn: 25 | def test_gpt3(self): 26 | 27 | output = ia3_learning("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = ia3_learning("t5") 33 | assert ERROR in output 34 | 35 | def test_remove_folders(self): 36 | os.system("rm -rf test_folder") 37 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/stages_tests/test_prompt_learn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | ERROR = 'RuntimeError: Could not detect "srun", are you indeed on a slurm cluster?' 5 | 6 | 7 | def prompt_learn(model_type): 8 | cmd = ( 9 | "python3 main.py " 10 | "stages=[prompt_learning] " 11 | f"prompt_learning={model_type}/squad " 12 | "launcher_scripts_path=. " 13 | "base_results_dir=test_folder" 14 | ) 15 | 16 | command = subprocess.Popen( 17 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True 18 | ) 19 | output, errors = command.communicate() 20 | 21 | return errors.decode() 22 | 23 | 24 | class TestPromptLearn: 25 | def test_gpt3(self): 26 | 27 | output = prompt_learn("gpt3") 28 | assert ERROR in output 29 | 30 | def test_t5(self): 31 | 32 | output = prompt_learn("t5") 33 | assert ERROR in output 34 | 35 | def test_mt5(self): 36 | 37 | output = prompt_learn("mt5") 38 | assert ERROR in output 39 | 40 | def test_remove_folders(self): 41 | os.system("rm -rf test_folder") 42 | os.system("rm -rf data") 43 | -------------------------------------------------------------------------------- /launcher_scripts/tests/unit_tests/utils_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/26e42fa91d9e608897ce485b4911c263a4b57008/launcher_scripts/tests/unit_tests/utils_tests/__init__.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | best_download>=0.0.6 2 | dask 3 | huggingface_hub>=0.13.0 4 | hydra-core==1.3.2 5 | img2dataset 6 | omegaconf>=2.2,<2.3 7 | pynvml==11.4.1 8 | pytablewriter==0.58.0 9 | requests==2.26.0 10 | tqdm==4.62.3 11 | zstandard==0.15.2 12 | hera 13 | pydantic 14 | kubeflow-training>=1.8 15 | kubernetes 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 119 3 | profile = black 4 | --------------------------------------------------------------------------------
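Note: the YAML files collected above are plain OmegaConf/Hydra configs that the launcher resolves at run time. As an illustrative sketch only (not a file from the repository), the snippet below shows how one of them — the NUMA-mapping defaults — could be loaded and overridden programmatically with the omegaconf package pinned in requirements.txt; the override values are hypothetical, and the working directory is assumed to be launcher_scripts/.

# Minimal sketch; assumes it is run from launcher_scripts/ and that the
# overrides below are examples, not recommended settings.
from omegaconf import OmegaConf

# Load the shipped NUMA-mapping defaults.
base = OmegaConf.load("nemo_launcher/collections/conf/numa_mapping.yaml")

# Apply CLI-style (dotlist) overrides, mirroring how Hydra overrides look.
overrides = OmegaConf.from_dotlist(["mode=all", "max_cores=null"])
cfg = OmegaConf.merge(base, overrides)

print(OmegaConf.to_yaml(cfg))  # inspect the resolved mapping settings
assert cfg.enable              # the shipped default keeps NUMA mapping enabled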