├── .coveragerc
├── .flake8
├── .github
├── ISSUE_TEMPLATE
│ ├── bug.md
│ ├── enhancement.md
│ ├── question.md
│ └── regression.md
└── workflows
│ └── stale.yml
├── .gitignore
├── .gitlab-ci.yml
├── .gitlab
├── labeler-config.yml
├── scripts
│ ├── build.sh
│ └── fetch-legacy-suite.sh
└── stages
│ ├── 00.pre.yml
│ ├── 01.build.yml
│ ├── 02.test.yml
│ ├── 03.integration-tests.yml
│ ├── 04.functional-tests.yml
│ └── 05.publish.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── CHANGELOG.md
├── CODEOWNERS
├── CONTRIBUTING.md
├── Dockerfile.ci.dev
├── Dockerfile.ci.lts
├── Dockerfile.linting
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
├── llama_mistral.md
└── source
│ ├── api-guide
│ ├── context_parallel.rst
│ ├── custom_fsdp.md
│ ├── datasets.rst
│ ├── dist_checkpointing.rst
│ ├── dist_checkpointing.strategies.rst
│ ├── dist_optimizer.md
│ ├── distributed.rst
│ ├── encoder_decoder_parallelism.rst
│ ├── fusions.rst
│ ├── index.rst
│ ├── models.bert.rst
│ ├── models.gpt.rst
│ ├── models.rst
│ ├── models.t5.rst
│ ├── moe.rst
│ ├── multi_latent_attention.rst
│ ├── multi_token_prediction.md
│ ├── num_microbatches_calculator.rst
│ ├── optimizer_cpu_offload.rst
│ ├── optimizer_param_scheduler.rst
│ ├── pipeline_parallel.rst
│ ├── tensor_parallel.rst
│ └── transformer.rst
│ ├── images
│ ├── context_parallel
│ │ ├── CP_overview.png
│ │ └── CP_results.png
│ ├── custom_fsdp
│ │ ├── FSDP_Allreduce.png
│ │ ├── FSDP_workflow.png
│ │ └── MCore_Custom_FSDP_Class_Diagram.png
│ ├── distrib_optimizer
│ │ ├── data_flow.png
│ │ └── sharding_scheme.png
│ ├── moe
│ │ └── token_drop.png
│ └── multi_token_prediction
│ │ └── MTP_implementation.png
│ ├── index.rst
│ └── user-guide
│ └── index.rst
├── examples
├── academic_paper_scripts
│ ├── detxoify_lm
│ │ ├── README.md
│ │ ├── annotations
│ │ │ ├── filter-selfgeneration.py
│ │ │ ├── perspective_api_annotate.py
│ │ │ └── preprocess.sh
│ │ ├── finetune_gpt.py
│ │ ├── finetune_gpt_distributed-1.3b.sh
│ │ ├── generate-1.3b.sh
│ │ ├── generate_samples_gpt.py
│ │ ├── perspective_api.py
│ │ └── self_generation
│ │ │ └── selfgenerate-1.3b-unconditional.sh
│ ├── msdp
│ │ ├── README.md
│ │ ├── data_processing.sh
│ │ ├── eval_knwl_generation.sh
│ │ ├── eval_resp_generation.sh
│ │ ├── prep_resp_gen.sh
│ │ ├── prompt_knwl_gen.sh
│ │ └── prompt_resp_gen.sh
│ └── sc21
│ │ ├── CONFIG.sh
│ │ ├── README.md
│ │ ├── SBATCH.sh
│ │ ├── SRUN.sh
│ │ ├── run_figure_11.sh
│ │ ├── run_figure_12.sh
│ │ ├── run_figure_13.sh
│ │ ├── run_figure_14.sh
│ │ ├── run_figure_15.sh
│ │ ├── run_figure_16.sh
│ │ ├── run_figure_17.sh
│ │ ├── run_figure_18.sh
│ │ └── run_table_1.sh
├── bert
│ ├── README.md
│ └── train_bert_340m_distributed.sh
├── export
│ ├── README.md
│ └── trtllm_export
│ │ ├── README.md
│ │ ├── distributed_export
│ │ └── gpt_distributed_gpu_export.py
│ │ └── single_device_export
│ │ └── gpt_single_device_cpu_export.py
├── gpt3
│ ├── README.md
│ ├── gpt_config.yaml
│ └── train_gpt3_175b_distributed.sh
├── inference
│ ├── README.md
│ ├── gpt
│ │ ├── gpt_dynamic_inference.py
│ │ ├── gpt_dynamic_inference_12b.sh
│ │ ├── gpt_dynamic_inference_357m.sh
│ │ ├── gpt_static_inference.py
│ │ └── utils.py
│ ├── llama_mistral
│ │ ├── huggingface_reference.py
│ │ ├── run_text_generation_llama3.1.sh
│ │ ├── run_text_generation_llama3.sh
│ │ └── run_text_generation_mistral.sh
│ ├── run_text_generation_server_345M.sh
│ ├── run_text_generation_server_345M_8_tensor_parallel.sh
│ └── t5
│ │ └── simple_t5_batch_inference.py
├── mamba
│ ├── .gitignore
│ ├── Dockerfile
│ ├── README.md
│ ├── run_text_gen_server_8b.sh
│ ├── run_text_gen_server_8b_gpt3.sh
│ └── train.sh
├── mixtral
│ ├── README.md
│ └── train_mixtral_8x7b_distributed.sh
├── multimodal
│ ├── Dockerfile
│ ├── README.md
│ ├── assets
│ │ └── pretrain_curves.png
│ ├── combine_lm_vision_checkpoints.sh
│ ├── combine_state_dicts.py
│ ├── config.py
│ ├── convert_llava_pretrain_to_wds.py
│ ├── dataloader_provider.py
│ ├── dataset_helpers.py
│ ├── energon_util.py
│ ├── evaluation
│ │ ├── evaluate_ai2d.py
│ │ ├── evaluate_chartqa.py
│ │ ├── evaluate_coco.py
│ │ ├── evaluate_infovqa.py
│ │ ├── evaluate_mathvista.py
│ │ ├── evaluate_mmmu.py
│ │ ├── evaluate_ocrbench.py
│ │ ├── evaluate_ocrbench_v2.py
│ │ ├── evaluate_rd_tablebench.py
│ │ ├── evaluate_realworldqa.py
│ │ ├── evaluate_spdocvqa.py
│ │ ├── evaluate_textvqa.py
│ │ ├── evaluate_video_motionbench.py
│ │ ├── evaluate_video_mvbench.py
│ │ ├── evaluate_video_phys_game_bench.py
│ │ ├── evaluate_vqav2.py
│ │ ├── evaluation_datasets.py
│ │ └── mmmu_utils.py
│ ├── image_processing.py
│ ├── layer_scaling.py
│ ├── layer_specs.py
│ ├── manual_prompts.json
│ ├── model.py
│ ├── model_converter
│ │ ├── clip_converter.py
│ │ ├── internvit_converter.py
│ │ ├── radio_converter.py
│ │ ├── siglip_converter.py
│ │ └── vision_model_tester.py
│ ├── multimodal_args.py
│ ├── nvlm
│ │ ├── README.md
│ │ ├── internvit.py
│ │ ├── nvlm_prompts.json
│ │ ├── pp_checkpoint_converter.py
│ │ ├── pretrain_blend.yaml
│ │ ├── pretrain_qwen20_72b_internvit_6b.sh
│ │ ├── pretrain_yi_34b_internvit_6b.sh
│ │ ├── run_text_generation_qwen20_72b_internvit_6b.sh
│ │ ├── run_text_generation_qwen25_7b_internvit_video.sh
│ │ ├── run_text_generation_qwen25_7b_siglip.sh
│ │ ├── run_text_generation_yi_34b_internvit_6b.sh
│ │ ├── sft_34b_internvit.sh
│ │ ├── sft_blend.yaml
│ │ ├── sft_qwen20_72b_internvit_6b.sh
│ │ └── sft_qwen2p5_7b_internvit_6b_video.sh
│ ├── pretrain_dataset.yaml
│ ├── pretrain_mistral_clip.sh
│ ├── radio
│ │ └── radio_g.py
│ ├── run_text_generation.py
│ ├── sft_dataset.yaml
│ ├── sft_mistral_clip.sh
│ ├── text_generation_mistral_clip.sh
│ └── train.py
├── post_training
│ └── modelopt
│ │ ├── README.md
│ │ ├── conf
│ │ ├── arguments.sh
│ │ ├── deepseek-ai
│ │ │ ├── DeepSeek-R1.sh
│ │ │ └── DeepSeek-V2-Lite.sh
│ │ ├── meta-llama
│ │ │ ├── Llama-3.1-8B-Instruct.sh
│ │ │ ├── Llama-3.2-1B-Instruct.sh
│ │ │ ├── Llama-4-Maverick-17B-128E-Instruct.sh
│ │ │ └── Llama-4-Scout-17B-16E-Instruct.sh
│ │ ├── nvidia
│ │ │ ├── Nemotron-H-4B-Instruct.sh
│ │ │ ├── Nemotron-H-8B-Base-8K.sh
│ │ │ └── Nemotron-Mini-4B-Instruct.sh
│ │ └── qwen
│ │ │ ├── Qwen3-235B-A22B.sh
│ │ │ └── Qwen3-30B-A3B.sh
│ │ ├── convert.sh
│ │ ├── convert_model.py
│ │ ├── export.py
│ │ ├── export.sh
│ │ ├── finetune.py
│ │ ├── finetune.sh
│ │ ├── generate.py
│ │ ├── generate.sh
│ │ ├── mmlu.py
│ │ ├── mmlu.sh
│ │ ├── quantize.py
│ │ ├── quantize.sh
│ │ ├── requirements.txt
│ │ └── speculative.md
├── retro
│ ├── README.md
│ ├── preprocess_data.sh
│ └── train_retro_2b_distributed.sh
├── run_simple_mcore_train_loop.py
└── t5
│ ├── README.md
│ ├── t5_mcore_train_curve.png
│ └── train_t5_220m_distributed.sh
├── images
├── model_table.png
├── strong_scaling.png
└── weak_scaling.png
├── megatron
├── core
│ ├── MSC_Integration.md
│ ├── QuickStart.md
│ ├── README.md
│ ├── README_STRAGGLER.md
│ ├── __init__.py
│ ├── config.py
│ ├── config_logger.py
│ ├── datasets
│ │ ├── Makefile
│ │ ├── __init__.py
│ │ ├── bert_dataset.py
│ │ ├── blended_dataset.py
│ │ ├── blended_megatron_dataset_builder.py
│ │ ├── blended_megatron_dataset_config.py
│ │ ├── gpt_dataset.py
│ │ ├── helpers.cpp
│ │ ├── helpers.py
│ │ ├── indexed_dataset.py
│ │ ├── masked_dataset.py
│ │ ├── megatron_dataset.py
│ │ ├── megatron_tokenizer.py
│ │ ├── multimodal_dataset.py
│ │ ├── object_storage_utils.py
│ │ ├── readme.md
│ │ ├── retro
│ │ │ ├── __init__.py
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bert_embedders.py
│ │ │ │ ├── config.py
│ │ │ │ ├── gpt_chunk_datasets.py
│ │ │ │ └── tokenizers.py
│ │ │ ├── db
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── dataset.py
│ │ │ │ └── utils.py
│ │ │ ├── external_libs.py
│ │ │ ├── index
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── index.py
│ │ │ │ ├── indexes
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── faiss_base.py
│ │ │ │ │ └── faiss_par_add.py
│ │ │ │ ├── utils.py
│ │ │ │ └── validate.py
│ │ │ ├── query
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt_chunk_dataset.py
│ │ │ │ ├── multi_split_gpt_dataset.py
│ │ │ │ ├── query.py
│ │ │ │ ├── retro_dataset.py
│ │ │ │ └── utils.py
│ │ │ └── utils.py
│ │ ├── t5_dataset.py
│ │ ├── utils.py
│ │ ├── utils_object_storage.py
│ │ └── utils_s3.py
│ ├── dist_checkpointing
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── dict_utils.py
│ │ ├── exchange_utils.py
│ │ ├── mapping.py
│ │ ├── optimizer.py
│ │ ├── serialization.py
│ │ ├── state_dict_utils.py
│ │ ├── strategies
│ │ │ ├── __init__.py
│ │ │ ├── async_utils.py
│ │ │ ├── base.py
│ │ │ ├── cached_metadata_filesystem_reader.py
│ │ │ ├── common.py
│ │ │ ├── filesystem_async.py
│ │ │ ├── fully_parallel.py
│ │ │ ├── resharding.py
│ │ │ ├── state_dict_saver.py
│ │ │ ├── tensorstore.py
│ │ │ ├── torch.py
│ │ │ ├── two_stage.py
│ │ │ └── zarr.py
│ │ ├── tensor_aware_state_dict.py
│ │ ├── utils.py
│ │ └── validation.py
│ ├── distributed
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── custom_fsdp
│ │ │ ├── __init__.py
│ │ │ ├── fully_sharded_data_parallel.py
│ │ │ └── param_and_grad_buffer.py
│ │ ├── data_parallel_base.py
│ │ ├── distributed_data_parallel.py
│ │ ├── distributed_data_parallel_config.py
│ │ ├── finalize_model_grads.py
│ │ ├── param_and_grad_buffer.py
│ │ ├── torch_fully_sharded_data_parallel.py
│ │ └── torch_fully_sharded_data_parallel_config.py
│ ├── enums.py
│ ├── export
│ │ ├── __init__.py
│ │ ├── data_type.py
│ │ ├── export_config.py
│ │ ├── model_type.py
│ │ └── trtllm
│ │ │ ├── __init__.py
│ │ │ ├── engine_builder
│ │ │ ├── __init__.py
│ │ │ └── trtllm_engine_builder.py
│ │ │ ├── model_to_trllm_mapping
│ │ │ ├── __init__.py
│ │ │ └── default_conversion_dict.py
│ │ │ ├── trt_model_config.py
│ │ │ ├── trt_model_type.py
│ │ │ ├── trtllm_helper.py
│ │ │ ├── trtllm_layers.py
│ │ │ └── trtllm_weights_converter
│ │ │ ├── __init__.py
│ │ │ ├── distributed_trtllm_model_weights_converter.py
│ │ │ ├── single_device_trtllm_model_weights_converter.py
│ │ │ └── utils.py
│ ├── extensions
│ │ ├── __init__.py
│ │ └── transformer_engine.py
│ ├── fp8_utils.py
│ ├── fusions
│ │ ├── __init__.py
│ │ ├── fused_bias_dropout.py
│ │ ├── fused_bias_geglu.py
│ │ ├── fused_bias_gelu.py
│ │ ├── fused_bias_swiglu.py
│ │ ├── fused_cross_entropy.py
│ │ ├── fused_indices_converter.py
│ │ ├── fused_layer_norm.py
│ │ └── fused_softmax.py
│ ├── inference
│ │ ├── __init__.py
│ │ ├── async_stream.py
│ │ ├── common_inference_params.py
│ │ ├── communication_utils.py
│ │ ├── contexts
│ │ │ ├── __init__.py
│ │ │ ├── base_context.py
│ │ │ ├── dynamic_chunk_allocator.py
│ │ │ ├── dynamic_context.py
│ │ │ └── static_context.py
│ │ ├── engines
│ │ │ ├── __init__.py
│ │ │ ├── abstract_engine.py
│ │ │ ├── dynamic_engine.py
│ │ │ ├── mcore_engine.py
│ │ │ └── static_engine.py
│ │ ├── inference_request.py
│ │ ├── model_inference_wrappers
│ │ │ ├── __init__.py
│ │ │ ├── abstract_model_inference_wrapper.py
│ │ │ ├── gpt
│ │ │ │ ├── __init__.py
│ │ │ │ └── gpt_inference_wrapper.py
│ │ │ ├── inference_wrapper_config.py
│ │ │ ├── multimodal
│ │ │ │ └── vlm_inference_wrapper.py
│ │ │ └── t5
│ │ │ │ ├── __init__.py
│ │ │ │ └── t5_inference_wrapper.py
│ │ ├── sampling_params.py
│ │ ├── scheduler.py
│ │ ├── text_generation_controllers
│ │ │ ├── __init__.py
│ │ │ ├── encoder_decoder_text_generation_controller.py
│ │ │ ├── simple_text_generation_controller.py
│ │ │ ├── text_generation_controller.py
│ │ │ └── vlm_text_generation_controller.py
│ │ └── utils.py
│ ├── inference_params.py
│ ├── jit.py
│ ├── model_parallel_config.py
│ ├── models
│ │ ├── T5
│ │ │ ├── __init__.py
│ │ │ ├── t5_model.py
│ │ │ └── t5_spec.py
│ │ ├── __init__.py
│ │ ├── bert
│ │ │ ├── __init__.py
│ │ │ ├── bert_layer_specs.py
│ │ │ ├── bert_lm_head.py
│ │ │ ├── bert_model.py
│ │ │ └── pooler.py
│ │ ├── common
│ │ │ ├── __init__.py
│ │ │ ├── embeddings
│ │ │ │ ├── __init__.py
│ │ │ │ ├── language_model_embedding.py
│ │ │ │ ├── relative_pos_embedding.py
│ │ │ │ ├── rope_utils.py
│ │ │ │ ├── rotary_pos_embedding.py
│ │ │ │ └── yarn_rotary_pos_embedding.py
│ │ │ ├── language_module
│ │ │ │ ├── __init__.py
│ │ │ │ └── language_module.py
│ │ │ └── vision_module
│ │ │ │ ├── __init__.py
│ │ │ │ └── vision_module.py
│ │ ├── gpt
│ │ │ ├── __init__.py
│ │ │ ├── gpt_layer_specs.py
│ │ │ ├── gpt_model.py
│ │ │ ├── heterogeneous
│ │ │ │ └── heterogeneous_layer_specs.py
│ │ │ └── moe_module_specs.py
│ │ ├── huggingface
│ │ │ ├── __init__.py
│ │ │ ├── clip_model.py
│ │ │ ├── module.py
│ │ │ └── qwen_model.py
│ │ ├── mamba
│ │ │ ├── __init__.py
│ │ │ ├── mamba_layer_specs.py
│ │ │ └── mamba_model.py
│ │ ├── mimo
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ └── base_configs.py
│ │ │ ├── model
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ └── submodules
│ │ │ │ ├── audio.py
│ │ │ │ ├── base.py
│ │ │ │ └── vision.py
│ │ ├── multimodal
│ │ │ ├── __init__.py
│ │ │ ├── context_parallel.py
│ │ │ ├── llava_model.py
│ │ │ └── llava_spec.py
│ │ ├── retro
│ │ │ ├── __init__.py
│ │ │ ├── base_attention.py
│ │ │ ├── config.py
│ │ │ ├── decoder_attention.py
│ │ │ ├── decoder_spec.py
│ │ │ ├── encoder_attention.py
│ │ │ ├── encoder_spec.py
│ │ │ ├── model.py
│ │ │ └── utils.py
│ │ └── vision
│ │ │ ├── __init__.py
│ │ │ ├── clip_vit_model.py
│ │ │ ├── multimodal_projector.py
│ │ │ ├── radio.py
│ │ │ └── vit_layer_specs.py
│ ├── msc_utils.py
│ ├── num_microbatches_calculator.py
│ ├── optimizer
│ │ ├── __init__.py
│ │ ├── clip_grads.py
│ │ ├── cpu_offloading
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── hybrid_optimizer.py
│ │ ├── distrib_optimizer.py
│ │ ├── grad_scaler.py
│ │ ├── optimizer.py
│ │ └── optimizer_config.py
│ ├── optimizer_param_scheduler.py
│ ├── package_info.py
│ ├── packed_seq_params.py
│ ├── parallel_state.py
│ ├── pipeline_parallel
│ │ ├── __init__.py
│ │ ├── p2p_communication.py
│ │ └── schedules.py
│ ├── post_training
│ │ ├── __init__.py
│ │ └── modelopt
│ │ │ ├── __init__.py
│ │ │ ├── gpt
│ │ │ ├── __init__.py
│ │ │ ├── model_specs.py
│ │ │ └── state_dict_hooks.py
│ │ │ ├── layers.py
│ │ │ └── mamba
│ │ │ ├── __init__.py
│ │ │ └── model_specs.py
│ ├── process_groups_config.py
│ ├── requirements.txt
│ ├── rerun_state_machine.py
│ ├── ssm
│ │ ├── __init__.py
│ │ ├── mamba_block.py
│ │ ├── mamba_hybrid_layer_allocation.py
│ │ ├── mamba_layer.py
│ │ ├── mamba_mixer.py
│ │ ├── mlp_layer.py
│ │ └── triton_cache_manager.py
│ ├── tensor_parallel
│ │ ├── __init__.py
│ │ ├── cross_entropy.py
│ │ ├── data.py
│ │ ├── layers.py
│ │ ├── mappings.py
│ │ ├── random.py
│ │ └── utils.py
│ ├── timers.py
│ ├── transformer
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── cuda_graphs.py
│ │ ├── custom_layers
│ │ │ ├── __init__.py
│ │ │ └── transformer_engine.py
│ │ ├── dot_product_attention.py
│ │ ├── enums.py
│ │ ├── heterogeneous
│ │ │ ├── heterogeneous_config.py
│ │ │ └── linear_replacements.py
│ │ ├── identity_op.py
│ │ ├── mlp.py
│ │ ├── module.py
│ │ ├── moe
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── experts.py
│ │ │ ├── fused_a2a.py
│ │ │ ├── grouped_gemm_util.py
│ │ │ ├── legacy_a2a_token_dispatcher.py
│ │ │ ├── moe_layer.py
│ │ │ ├── moe_utils.py
│ │ │ ├── router.py
│ │ │ ├── shared_experts.py
│ │ │ ├── token_dispatcher.py
│ │ │ └── upcycling_utils.py
│ │ ├── multi_latent_attention.py
│ │ ├── multi_token_prediction.py
│ │ ├── spec_utils.py
│ │ ├── torch_layer_norm.py
│ │ ├── torch_norm.py
│ │ ├── transformer_block.py
│ │ ├── transformer_config.py
│ │ ├── transformer_layer.py
│ │ └── utils.py
│ └── utils.py
├── inference
│ ├── __init__.py
│ ├── endpoints
│ │ ├── common.py
│ │ └── completions.py
│ ├── static
│ │ └── index.html
│ ├── text_generation
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── beam_utils.py
│ │ ├── communication.py
│ │ ├── forward_step.py
│ │ ├── generation.py
│ │ ├── mcore_engine_server.py
│ │ ├── sampling.py
│ │ └── tokenization.py
│ └── text_generation_server.py
├── legacy
│ ├── data
│ │ ├── __init__.py
│ │ ├── autoaugment.py
│ │ ├── biencoder_dataset_utils.py
│ │ ├── data_samplers.py
│ │ ├── dataset_utils.py
│ │ ├── ict_dataset.py
│ │ ├── image_folder.py
│ │ ├── multimodal_dataset.py
│ │ ├── orqa_wiki_dataset.py
│ │ ├── realm_dataset_utils.py
│ │ ├── realm_index.py
│ │ └── vit_dataset.py
│ ├── fp16_deprecated
│ │ └── loss_scaler.py
│ ├── fused_kernels
│ │ ├── __init__.py
│ │ ├── compat.h
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── test_fused_kernels.py
│ │ └── type_shim.h
│ ├── indexer.py
│ ├── model
│ │ ├── __init__.py
│ │ ├── bert_model.py
│ │ ├── biencoder_model.py
│ │ ├── classification.py
│ │ ├── enums.py
│ │ ├── fused_bias_gelu.py
│ │ ├── fused_layer_norm.py
│ │ ├── fused_softmax.py
│ │ ├── gpt_model.py
│ │ ├── language_model.py
│ │ ├── module.py
│ │ ├── multiple_choice.py
│ │ ├── realm_model.py
│ │ ├── rms_norm.py
│ │ ├── t5_model.py
│ │ ├── transformer.py
│ │ ├── utils.py
│ │ └── vision
│ │ │ ├── classification.py
│ │ │ ├── dino.py
│ │ │ ├── esvit_swin_backbone.py
│ │ │ ├── inpainting.py
│ │ │ ├── knn_monitor.py
│ │ │ ├── mit_backbone.py
│ │ │ ├── swin_backbone.py
│ │ │ ├── utils.py
│ │ │ └── vit_backbone.py
│ └── mpu
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── commons.py
│ │ ├── test_cross_entropy.py
│ │ ├── test_data.py
│ │ ├── test_initialize.py
│ │ ├── test_layers.py
│ │ └── test_random.py
├── post_training
│ ├── __init__.py
│ ├── algos
│ │ ├── __init__.py
│ │ └── distillation.py
│ ├── arguments.py
│ ├── checkpointing.py
│ ├── docs
│ │ └── distillation.md
│ ├── generate.py
│ ├── loss_func.py
│ ├── model_provider.py
│ ├── non_loss_data_func.py
│ └── utils.py
└── training
│ ├── __init__.py
│ ├── activations.py
│ ├── arguments.py
│ ├── async_utils.py
│ ├── checkpointing.py
│ ├── dist_signal_handler.py
│ ├── ft_integration.py
│ ├── global_vars.py
│ ├── initialize.py
│ ├── inprocess_restart.py
│ ├── log_handler.py
│ ├── one_logger_utils.py
│ ├── theoretical_memory_usage.py
│ ├── tokenizer
│ ├── __init__.py
│ ├── bert_tokenization.py
│ ├── gpt2_tokenization.py
│ ├── multimodal_tokenizer.py
│ └── tokenizer.py
│ ├── training.py
│ ├── utils.py
│ ├── wandb_utils.py
│ └── yaml_arguments.py
├── model_config.yaml
├── mypy.ini
├── patches
└── nemo_2.3.0_te.patch
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_ict.py
├── pretrain_mamba.py
├── pretrain_retro.py
├── pretrain_t5.py
├── pretrain_vision_classify.py
├── pretrain_vision_dino.py
├── pretrain_vision_inpaint.py
├── pretrain_vlm.py
├── pyproject.toml
├── pytest.ini
├── requirements
├── pytorch_24.01
│ └── requirements.txt
├── pytorch_24.07
│ └── requirements.txt
├── pytorch_24.10
│ └── requirements.txt
└── pytorch_25.03
│ └── requirements.txt
├── requirements_ci.txt
├── requirements_mlm.txt
├── setup.py
├── tasks
├── data_utils.py
├── ensemble_classifier.py
├── eval_utils.py
├── finetune_utils.py
├── glue
│ ├── data.py
│ ├── finetune.py
│ ├── mnli.py
│ └── qqp.py
├── main.py
├── msdp
│ ├── README.md
│ ├── evaluate.py
│ ├── main.py
│ ├── metrics.py
│ ├── preprocessing.py
│ └── prompt.py
├── orqa
│ ├── README.md
│ ├── evaluate_orqa.py
│ ├── evaluate_utils.py
│ ├── supervised
│ │ ├── data.py
│ │ ├── eval_utils.py
│ │ └── finetune.py
│ └── unsupervised
│ │ ├── nq.py
│ │ ├── qa_utils.py
│ │ └── tokenizers.py
├── quantize
│ └── calibrate_gpt.py
├── race
│ ├── data.py
│ └── finetune.py
├── vision
│ ├── classification
│ │ ├── classification.py
│ │ └── eval_utils.py
│ ├── finetune_utils.py
│ ├── main.py
│ └── segmentation
│ │ ├── cityscapes.py
│ │ ├── data.py
│ │ ├── finetune_segformer.py
│ │ ├── finetune_setr.py
│ │ ├── metrics.py
│ │ ├── seg_heads.py
│ │ ├── seg_models.py
│ │ ├── transforms.py
│ │ └── utils.py
└── zeroshot_gpt
│ ├── datasets.py
│ ├── detokenizer.py
│ └── evaluate.py
├── tests
├── __init__.py
├── functional_tests
│ ├── __init__.py
│ ├── python_test_utils
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── conftest.py
│ │ ├── get_test_results_from_tensorboard_logs.py
│ │ ├── test_inference_regular_pipeline.py
│ │ ├── test_pretraining_regular_pipeline.py
│ │ └── test_pretraining_resume_checkpoint_pipeline.py
│ ├── shell_test_utils
│ │ ├── _run_training.sh
│ │ ├── run_ci_test.sh
│ │ └── start_interactive_job.sh
│ └── test_cases
│ │ ├── bert
│ │ ├── bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── bert_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── common
│ │ └── ckpt_converter
│ │ │ ├── __main__.py
│ │ │ └── model_config.yaml
│ │ ├── gpt-nemo
│ │ ├── bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G
│ │ │ └── model_config.yaml
│ │ ├── llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G
│ │ │ └── model_config.yaml
│ │ └── t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gpt
│ │ ├── gpt3_15b_8t_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.8.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_15b_8t_release_sm
│ │ │ ├── golden_values_0.11.0_PyT24.10_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_PyT25.01_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_PyT25.03_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── gpt_inference_tp1_pp1_583m_logitsmatch
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── hybrid
│ │ ├── hybrid_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── hybrid_mr_mcore_te_tp2_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral
│ │ ├── mixtral_8x22b_tp2pp8ep8vpp1_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_patch_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.8.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm
│ │ │ └── model_config.yaml
│ │ └── mixtral_8x7b_tp1pp4ep8vpp8_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── moe
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts
│ │ │ ├── golden_values_dev.json
│ │ │ ├── golden_values_lts.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mcore_te_tp1_pp1_ep8_etp1_cp_memory_speed
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal-llava
│ │ ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── t5
│ │ ├── t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
│ │ └── golden_values_lts_dgx_a100.json
│ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
│ │ └── golden_values_lts_dgx_a100.json
│ │ └── t5_release
│ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ └── model_config.yaml
├── test_utils
│ ├── python_scripts
│ │ ├── auto_reminder.py
│ │ ├── check_status_of_main.py
│ │ ├── common.py
│ │ ├── dashboard.py
│ │ ├── download_coverage_results.py
│ │ ├── download_golden_values.py
│ │ ├── generate_jet_trigger_job.py
│ │ ├── generate_local_jobs.py
│ │ ├── launch_jet_workload.py
│ │ ├── notify.py
│ │ └── wait_for_resources.py
│ └── recipes
│ │ ├── _build-mcore-dev.yaml
│ │ ├── _build-mcore-lts.yaml
│ │ ├── _build-nemo.yaml
│ │ ├── bert.yaml
│ │ ├── common.yaml
│ │ ├── gpt-inference.yaml
│ │ ├── gpt-nemo.yaml
│ │ ├── gpt.yaml
│ │ ├── mamba.yaml
│ │ ├── moe.yaml
│ │ ├── multimodal-llava.yaml
│ │ ├── t5.yaml
│ │ └── unit-tests.yaml
└── unit_tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── data
│ ├── __init__.py
│ ├── test_bin_reader.py
│ ├── test_builder.py
│ ├── test_gpt_dataset.py
│ ├── test_multimodal_dataset.py
│ ├── test_preprocess_data.py
│ └── test_preprocess_mmdata.py
│ ├── dist_checkpointing
│ ├── __init__.py
│ ├── conftest.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── test_bert_model.py
│ │ ├── test_gpt_model.py
│ │ ├── test_mamba.py
│ │ ├── test_mlp_glu.py
│ │ ├── test_moe_experts.py
│ │ └── test_t5_model.py
│ ├── test_async_save.py
│ ├── test_flattened_resharding.py
│ ├── test_fp8.py
│ ├── test_fully_parallel.py
│ ├── test_global_metadata_reuse.py
│ ├── test_local.py
│ ├── test_mapping.py
│ ├── test_msc.py
│ ├── test_nonpersistent.py
│ ├── test_optimizer.py
│ ├── test_replication.py
│ ├── test_serialization.py
│ ├── test_torch_dist.py
│ └── utils.py
│ ├── distributed
│ ├── test_distributed_data_parallel.py
│ ├── test_finalize_model_grads.py
│ ├── test_grad_reduce_for_replicated_embedder.py
│ ├── test_grad_sync_with_expert_parallel.py
│ ├── test_mcore_fully_sharded_data_parallel.py
│ ├── test_param_and_grad_buffer.py
│ └── test_torch_fully_sharded_parallel.py
│ ├── export
│ └── trtllm
│ │ ├── __init__.py
│ │ ├── test_distributed_fp8.py
│ │ ├── test_single_device_fp8.py
│ │ ├── test_trtllm_distributed_gpu_converter.py
│ │ ├── test_trtllm_helper.py
│ │ ├── test_trtllm_layers.py
│ │ └── test_trtllm_single_device_converter.py
│ ├── fusions
│ ├── test_bias_dropout_fusion.py
│ ├── test_swiglu_fusion.py
│ └── test_torch_softmax.py
│ ├── inference
│ ├── __init__.py
│ ├── contexts
│ │ └── test_dynamic_context.py
│ ├── engines
│ │ ├── __init__.py
│ │ ├── test_dynamic_engine.py
│ │ └── test_static_engine.py
│ ├── model_inference_wrappers
│ │ ├── __init__.py
│ │ ├── gpt
│ │ │ └── test_gpt_inference_wrapper.py
│ │ ├── t5
│ │ │ └── test_t5_inference_wrapper.py
│ │ └── test_model_inference_wrapper_config.py
│ ├── test_common_inference_params.py
│ ├── test_communication_utils.py
│ ├── test_flash_decode.py
│ ├── test_inference_utils.py
│ ├── test_scheduler.py
│ └── text_generation_controllers
│ │ ├── __init__.py
│ │ ├── test_encoder_decoder_text_generation_controller.py
│ │ ├── test_simple_text_generation_controller.py
│ │ └── test_vlm_text_generation_controller.py
│ ├── models
│ ├── __init__.py
│ ├── test_base_embedding.py
│ ├── test_bert_model.py
│ ├── test_clip_vit_model.py
│ ├── test_gpt_model.py
test_gpt_model.py │ ├── test_heterogeneous_gpt_model.py │ ├── test_llava_model.py │ ├── test_mamba_model.py │ ├── test_mimo_audio_submodules.py │ ├── test_mimo_embedding_alignment.py │ ├── test_mimo_model.py │ ├── test_mimo_submodules.py │ ├── test_multimodal_projector.py │ ├── test_radio_model.py │ └── test_t5_model.py │ ├── pipeline_parallel │ ├── __init__.py │ ├── test_helpers.py │ └── test_schedules.py │ ├── post_training │ ├── __init__.py │ └── test_modelopt_module_spec.py │ ├── run_ci_test.sh │ ├── ssm │ ├── test_mamba_block.py │ ├── test_mamba_hybrid_layer_allocation.py │ ├── test_mamba_layer.py │ └── test_mamba_mixer.py │ ├── tensor_parallel │ ├── __init__.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialization.py │ ├── test_layers.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_checkpointing.py │ ├── test_fp8_param.py │ ├── test_imports.py │ ├── test_inference.py │ ├── test_local_multi_tensor_fns.py │ ├── test_model_configs.py │ ├── test_num_microbatches_calculator.py │ ├── test_optimizer.py │ ├── test_optimizer_cpu_offloading.py │ ├── test_optimizer_param_scheduler.py │ ├── test_parallel_state.py │ ├── test_process_groups_config.py │ ├── test_tokenizer.py │ ├── test_training.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── moe │ ├── __init__.py │ ├── conftest.py │ ├── test_a2a_token_dispatcher.py │ ├── test_aux_loss.py │ ├── test_grouped_mlp.py │ ├── test_moe_layer.py │ ├── test_moe_layer_discrepancy.py │ ├── test_multihot_indices_converter.py │ ├── test_routers.py │ ├── test_sequential_mlp.py │ ├── test_shared_experts.py │ ├── test_token_dispatcher.py │ └── test_upcycling.py │ ├── test_attention.py │ ├── test_attention_no_rope.py │ ├── test_attention_packed_seq.py │ ├── test_core_attention.py │ ├── test_cuda_graphs.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_multi_latent_attention.py │ ├── test_multi_token_prediction.py │ ├── test_relative_attention.py │ ├── test_retro_attention.py │ ├── test_rope.py │ ├── test_spec_customization.py │ ├── test_transformer_block.py │ ├── test_transformer_block_custom_pgs.py │ ├── test_transformer_layer.py │ └── test_utils.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py └── huggingface.py ├── checkpoint ├── convert.py ├── hybrid_conversion.py ├── loader_base.py ├── loader_core.py ├── loader_legacy.py ├── loader_llama_mistral.py ├── loader_llava.py ├── loader_mixtral_hf.py ├── saver_base.py ├── saver_core.py ├── saver_hf_llava.py ├── saver_legacy.py ├── saver_llava.py ├── schema_base.py ├── schema_core.py ├── schema_hf.py └── utils.py ├── copyright.sh ├── linter.py ├── merge_datasets.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── report_theoretical_memory.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── config_utils.py ├── docker │ └── Dockerfile ├── preprocess_data.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh └── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py ├── run_inference_performance_test.py ├── run_mamba_text_generation_server.py ├── run_mamba_text_generation_server_completions.py ├── run_text_generation_server.py ├── run_vlm_text_generation.py ├── text_generation_cli.py └── wait_daemon.sh 
/.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | relative_files = true 7 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | extend-ignore = E203,E501,F401,E402,E714 4 | per-file-ignores = __init__.py:F401 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 21 | 22 | **Environment (please complete the following information):** 23 | - Megatron-LM commit ID 24 | - PyTorch version 25 | - CUDA version 26 | - NCCL version 27 | 28 | **Proposed fix** 29 | If you have a proposal for how to fix the issue, state it here or link to a PR. 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ENHANCEMENT 3 | about: Suggest an idea to improve this project 4 | title: "[ENHANCEMENT]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Proposed implementation** 20 | If you have a proposed implementation for the feature, state it here or link to a PR. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUESTION 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement 4 | request 5 | title: "[QUESTION]" 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Your question** 12 | Ask a clear and concise question about Megatron-LM.
13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/regression.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: REGRESSION 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update 4 | title: "[REGRESSION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the regression** 11 | A clear and concise description of what the regression is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention. 15 | 16 | **Previous performance** 17 | What speed or accuracy did you previously see? 18 | 19 | **New performance** 20 | What speed or accuracy do you see after the update? 21 | 22 | **Stack trace/logs** 23 | If applicable, add the stack trace or logs related to the regression. 24 | 25 | **Environment (please complete the following information):** 26 | - Previous Megatron-LM commit ID 27 | - New Megatron-LM commit ID 28 | - Previous PyTorch version 29 | - New PyTorch version 30 | - Previous CUDA version 31 | - New CUDA version 32 | - Previous NCCL version 33 | - New NCCL version 34 | 35 | **Proposed fix** 36 | If you have a proposal for how to fix the issue, state it here or link to a PR. 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.'
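# Note: days-before-close is -1 below, so items are only labeled 'stale' and are never closed automatically.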
27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | .vscode 10 | local/ 11 | .gitmodules 12 | wandb/ 13 | onelogger.log 14 | onelogger.err -------------------------------------------------------------------------------- /.gitlab/labeler-config.yml: -------------------------------------------------------------------------------- 1 | CI: 2 | - .gitlab-ci.yml 3 | - Dockerfile.ci.lts 4 | - Dockerfile.ci.dev 5 | - .github/** 6 | - .gitlab/** 7 | 8 | Datasets: 9 | - megatron/core/datasets/** 10 | 11 | BERT: 12 | - megatron/core/models/bert/** 13 | 14 | GPT: 15 | - megatron/core/models/gpt/** 16 | 17 | RETRO: 18 | - megatron/core/models/retro/** 19 | 20 | Dist-Ckpt: 21 | - megatron/core/dist_checkpointing 22 | 23 | Dist-Opt: 24 | - megatron/core/optimizer/distrib_optimizer 25 | 26 | Inference: 27 | - megatron/core/inference 28 | 29 | MoE: 30 | - megatron/core/transformer/moe 31 | 32 | Tests: 33 | - tests/** 34 | 35 | ParallelState: 36 | - megatron/core/parallel_state.py 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 'refs/tags/24.4.2:refs/tags/24.4.2' 4 | hooks: 5 | - id: black 6 | files: ^megatron/core/.* 7 | args: ["--skip-magic-trailing-comma"] 8 | - repo: https://github.com/pycqa/pylint 9 | rev: v3.2.6 10 | hooks: 11 | - id: pylint 12 | files: ^megatron/core/.* 13 | - repo: https://github.com/pycqa/isort 14 | rev: 5.13.2 15 | hooks: 16 | - id: isort 17 | files: ^megatron/core/.* -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | ignore-paths=tests 3 | max-line-length=100 4 | 5 | [MESSAGES CONTROL] 6 | disable=all 7 | 8 | enable=C0115,C0116,W0611,C0301,E0606 9 | # C0115: missing-class-docstring 10 | # C0116: missing-function-docstring 11 | # W0611: unused-import 12 | # C0301: line-too-long 13 | # E0606: possibly-used-before-assignment -------------------------------------------------------------------------------- /Dockerfile.linting: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | ARG FROM_IMAGE_NAME 4 | FROM $FROM_IMAGE_NAME as main 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ 8 | /etc/apt/apt.conf.d/docker-clean 9 | 10 | RUN apt-get update && \ 11 | apt-get install -y python3-venv && \ 12 | apt-get clean && \ 13 | python -m venv /opt/jet 14 | 15 | RUN pip3 install --no-cache-dir \ 16 | black==24.4.2 \ 17 | isort==5.13.2 \ 18 | flake8==7.1.0 \ 19 | pylint==3.2.6 \ 20 | coverage \ 21 | mypy \ 22 | python-gitlab \ 23 | pandas \ 24 | slack-sdk 25 | 26 | WORKDIR /opt/megatron-lm 27 | 28 | ##### For NVIDIANS only ##### 29 | FROM main as jet 30 | ARG JET_API_VERSION 31 | RUN --mount=type=secret,id=JET_INDEX_URLS \ 32 | JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ 33 | pip install --no-cache-dir "jet-client~=2.0" --upgrade $JET_INDEX_URLS 
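# Put the /opt/jet virtualenv (created above) on PATH so its entry points are available.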
34 | ENV PATH="$PATH:/opt/jet/bin" 35 | ### -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/core/requirements.txt 2 | include megatron/core/README.md 3 | recursive-include requirements * 4 | -------------------------------------------------------------------------------- /docs/source/api-guide/index.rst: -------------------------------------------------------------------------------- 1 | API Guide 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | models 8 | tensor_parallel 9 | context_parallel 10 | pipeline_parallel 11 | custom_fsdp 12 | fusions 13 | transformer 14 | moe 15 | dist_checkpointing 16 | dist_optimizer 17 | distributed 18 | datasets 19 | multi_latent_attention 20 | num_microbatches_calculator 21 | optimizer_param_scheduler 22 | optimizer_cpu_offload 23 | multi_token_prediction 24 | encoder_decoder_parallelism -------------------------------------------------------------------------------- /docs/source/api-guide/models.bert.rst: -------------------------------------------------------------------------------- 1 | models.bert package 2 | =================== 3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.bert.bert\_model module 9 | ------------------------------ 10 | 11 | .. automodule:: core.models.bert.bert_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: core.models.bert 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.gpt.rst: -------------------------------------------------------------------------------- 1 | models.gpt package 2 | ================== 3 | This is the implementation of the popular GPT model. It supports several features such as model parallelism (Tensor Parallel, Pipeline Parallel, Data Parallel), mixture of experts, FP8, and the distributed optimizer. We are constantly adding new features, so be on the lookout, or raise an issue if you want to have something added. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.gpt.gpt\_model module 9 | ---------------------------- 10 | 11 | .. automodule:: core.models.gpt.gpt_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: core.models.gpt 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.rst: -------------------------------------------------------------------------------- 1 | models package 2 | ============== 3 | This package contains most of the popular LLMs. Currently we support GPT, BERT, T5, and Retro. This is an ever-growing list, so keep an eye out. 4 | 5 | Subpackages 6 | ----------- 7 | 8 | .. toctree:: 9 | :maxdepth: 4 10 | 11 | models.gpt 12 | models.t5 13 | models.bert 14 | 15 | Module contents 16 | --------------- 17 | 18 | ..
automodule:: core.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.t5.rst: -------------------------------------------------------------------------------- 1 | models.t5 package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | models.t5.t5\_model module 8 | -------------------------- 9 | 10 | .. automodule:: core.models.T5.t5_model 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: core.models.T5 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/moe.rst: -------------------------------------------------------------------------------- 1 | Mixture of Experts package 2 | ========================== 3 | 4 | .. mdinclude :: ../../../megatron/core/transformer/moe/README.md 5 | -------------------------------------------------------------------------------- /docs/source/api-guide/multi_latent_attention.rst: -------------------------------------------------------------------------------- 1 | Multi-Latent Attention 2 | ====================== 3 | 4 | Multi-Latent Attention overview 5 | ------------------------------- 6 | 7 | Multi-Latent Attention ("MLA") is an innovative attention mechanism introduced by the DeepSeek team that enhances the efficiency of attention computation by leveraging multiple latent spaces. This approach is particularly beneficial for large language models (LLMs), as it reduces the computational burden associated with traditional attention mechanisms. According to the DeepSeek-V2 technical report, MLA achieves better performance than Multi-Head Attention (MHA) and requires a smaller KV cache. 8 | 9 | Enabling Multi-Latent Attention 10 | ------------------------------- 11 | 12 | To enable MLA in Megatron-LM, set the following flags on the command line: 13 | - `--multi-latent-attention` to enable MLA in the model. 14 | - Set `MLATransformerConfig` to configure MLA. 15 | -------------------------------------------------------------------------------- /docs/source/api-guide/num_microbatches_calculator.rst: -------------------------------------------------------------------------------- 1 | Microbatches Calculator 2 | ======================= 3 | This API is used to calculate the number of microbatches required to train a given model at a given batch size. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | .. automodule:: core.num_microbatches_calculator 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/api-guide/optimizer_cpu_offload.rst: -------------------------------------------------------------------------------- 1 | Optimizer CPU offload package 2 | ============================== 3 | 4 | .. mdinclude :: ../../../megatron/core/optimizer/cpu_offloading/README.md 5 | -------------------------------------------------------------------------------- /docs/source/api-guide/optimizer_param_scheduler.rst: -------------------------------------------------------------------------------- 1 | Optimizer Parameters Scheduler 2 | ============================== 3 | This API is used to calculate the learning rate and weight decay for the optimizer. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | ..
automodule:: core.optimizer_param_scheduler 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/context_parallel/CP_overview.png -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/context_parallel/CP_results.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/FSDP_Allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/FSDP_Allreduce.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/FSDP_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/FSDP_workflow.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /docs/source/images/moe/token_drop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/moe/token_drop.png -------------------------------------------------------------------------------- /docs/source/images/multi_token_prediction/MTP_implementation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/multi_token_prediction/MTP_implementation.png -------------------------------------------------------------------------------- /docs/source/index.rst: 
-------------------------------------------------------------------------------- 1 | .. Megatron Core documentation master file, created by 2 | sphinx-quickstart on Tue Aug 15 13:44:10 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Megatron Core User Guide 7 | =================================== 8 | 9 | **Megatron Core** is a Python library that contains the core components required to build your language models. 10 | A reference implementation of Megatron Core can be found in `NeMo `_. It offers a *simple* and 11 | *intuitive* API. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: User Guide 16 | 17 | user-guide/index 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | :caption: API Guide 22 | 23 | api-guide/index 24 | -------------------------------------------------------------------------------- /docs/source/user-guide/index.rst: -------------------------------------------------------------------------------- 1 | User Guide 2 | ============ 3 | 4 | .. mdinclude:: ../../../megatron/core/QuickStart.md 5 | .. mdinclude:: ../../../megatron/core/MSC_Integration.md -------------------------------------------------------------------------------- /examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge and response generation. More details are available in the [`knowledgeable task directory`](../../tasks/msdp).
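(As a rough guide to the scripts here: `data_processing.sh` prepares the data, `prompt_knwl_gen.sh` and `prompt_resp_gen.sh` run the knowledge- and response-generation prompting stages, `prep_resp_gen.sh` builds the input for the response stage, and the `eval_*.sh` scripts evaluate the generations.)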
5 | 6 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 
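# Note: INTERLEAVED=YES additionally passes --num-layers-per-virtual-pipeline-stage 2, splitting each pipeline stage into 2-layer virtual stages to reduce the pipeline bubble.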
11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 
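# DP is derived below as 64/TP, so TP x DP always covers the same 64 GPUs (8 nodes x 8 GPUs per node).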
11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 
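# Both settings below use the interleaved schedule; SCATTER_GATHER=NO additionally passes --no-scatter-gather-tensors-in-pipeline to disable the optimization.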
18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/export/README.md: -------------------------------------------------------------------------------- 1 | # Megatron Core Export 2 | 3 | This module is used to export Megatron Core models to different inference frameworks. 4 | Currently we support TRTLLM export; in the future we will add support for vLLM and other frameworks. 5 | 6 | ## PTQ AND EXPORT 7 | Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post-training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment. 8 | 9 | ## TRTLLM EXPORT 10 | Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone. 11 | -------------------------------------------------------------------------------- /examples/inference/llama_mistral/huggingface_reference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 3 | 4 | # Set up argument parsing 5 | parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.") 6 | parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation") 7 | parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint") 8 | 9 | # Parse command-line arguments 10 | args = parser.parse_args() 11 | 12 | model_path = args.model_path 13 | prompt = args.prompt 14 | 15 | config = AutoConfig.from_pretrained(model_path) 16 | tokenizer = AutoTokenizer.from_pretrained(model_path, config=config) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda() 18 | 19 | inputs = tokenizer(prompt, return_tensors="pt") 20 | for key in inputs: 21 | inputs[key] = inputs[key].cuda() 22 | # top_k, top_p and do_sample are set for greedy, argmax-based sampling 23 | 24 | outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) 25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model.
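# Set CHECKPOINT, VOCAB_FILE and MERGE_FILE below before launching the server.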
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --seed 42 32 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --vocab-file $VOCAB_FILE \ 28 | --merge-file $MERGE_FILE \ 29 | --seed 42 30 | -------------------------------------------------------------------------------- /examples/mamba/.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 2 | data-cache/ 3 | tensorboard/ 4 | triton-cache/ 5 | -------------------------------------------------------------------------------- /examples/multimodal/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.02-py3 2 | 3 | RUN apt update && \ 4 | apt -y upgrade && \ 5 | apt install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | python3-pip \ 9 | python3-dev \ 10 | bash \ 11 | git \ 12 | vim \ 13 | tmux \ 14 | python-is-python3 \ 15 | default-jre 16 | 17 | RUN pip install --upgrade pip 18 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging 19 | RUN pip install transformers datasets accelerate timm 20 | RUN pip install pytest-cov pytest_mock nltk wrapt 21 | RUN pip install zarr "tensorstore==0.1.45" 22 | RUN pip install black isort click==8.0.2 23 | RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken 24 | RUN pip install git+https://github.com/openai/CLIP.git 25 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies. 
26 | RUN pip install open_clip_torch open-flamingo[eval] --no-deps 27 | -------------------------------------------------------------------------------- /examples/multimodal/assets/pretrain_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/examples/multimodal/assets/pretrain_curves.png -------------------------------------------------------------------------------- /examples/multimodal/convert_llava_pretrain_to_wds.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import webdataset as wds 4 | 5 | from tqdm import tqdm 6 | 7 | llava_pretrain_dir = '' 8 | 9 | # Paths to the dataset files 10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json') 11 | output = os.path.join(llava_pretrain_dir, 'wds') 12 | 13 | if not os.path.exists(output): 14 | os.mkdir(output) 15 | 16 | # Load data 17 | with open(json_file, 'r') as f: 18 | data = json.load(f) 19 | 20 | with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: 21 | for entry in tqdm(data): 22 | with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: 23 | image_data = img_file.read() 24 | sample = { 25 | "__key__": entry['id'], 26 | "jpg": image_data, 27 | "json": json.dumps(entry['conversations']).encode("utf-8"), 28 | } 29 | shard_writer.write(sample) 30 | 31 | print("Dataset successfully converted to wds") 32 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/pretrain_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1. 7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | - weight: 0.01 17 | path: 18 | subflavors: 19 | augmentation: False 20 | 21 | # Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets. 22 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 23 | val: 24 | datasets: 25 | - weight: 1. 26 | path: 27 | subflavors: 28 | augmentation: False 29 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/sft_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1. 7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | # Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets. 17 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 18 | val: 19 | datasets: 20 | - weight: 1.
21 | path: 22 | subflavors: 23 | augmentation: False 24 | -------------------------------------------------------------------------------- /examples/multimodal/pretrain_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/multimodal/sft_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/meta-llama/Llama-3.1-8B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=meta-llama/Llama-3.1-8B-Instruct 5 | TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 1.0 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization RMSNorm \ 22 | --swiglu \ 23 | --num-layers 32 \ 24 | --hidden-size 4096 \ 25 | --ffn-hidden-size 14336 \ 26 | --num-attention-heads 32 \ 27 | --group-query-attention \ 28 | --num-query-groups 8 \ 29 | --seq-length 4096 \ 30 | --max-position-embeddings 8192 \ 31 | --tokenizer-type HuggingFaceTokenizer \ 32 | --make-vocab-size-divisible-by 1 \ 33 | --use-mcore-models \ 34 | --rotary-base 500000 \ 35 | --use-rope-scaling \ 36 | " 37 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/meta-llama/Llama-3.2-1B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=meta-llama/Llama-3.2-1B-Instruct 5 | TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --use-rotary-position-embeddings \ 17 | --no-rope-fusion \ 18 | --no-position-embedding \ 19 | --normalization RMSNorm \ 20 | --swiglu \ 21 | --num-layers 16 \ 22 | --hidden-size 2048 \ 23 | --ffn-hidden-size 8192 \ 24 | --num-attention-heads 32 \ 25 | --group-query-attention \ 26 | --num-query-groups 8 \ 27 | --seq-length 4096 \ 28 | --max-position-embeddings 8192 \ 29 | --tokenizer-type HuggingFaceTokenizer \ 30 | --make-vocab-size-divisible-by 1 \ 31 | --use-mcore-models \ 32 | --rotary-percent 1.0 \ 33 | --rotary-base 500000 \ 34 | --use-rope-scaling \ 35 | --export-force-local-attention \ 36 | " 37 | -------------------------------------------------------------------------------- 
/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=nvidia/Nemotron-H-4B-Instruct 5 | TOKENIZER_MODEL=nvidia/Nemotron-H-4B-Instruct 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 0.5 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization RMSNorm \ 22 | --squared-relu \ 23 | --num-layers 52 \ 24 | --hidden-size 3072 \ 25 | --ffn-hidden-size 12288 \ 26 | --kv-channels 128 \ 27 | --num-attention-heads 32 \ 28 | --group-query-attention \ 29 | --num-query-groups 8 \ 30 | --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \ 31 | --mamba-head-dim 64 \ 32 | --mamba-num-heads 112 \ 33 | --mamba-num-groups 8 \ 34 | --mamba-state-dim 128 \ 35 | --seq-length 4096 \ 36 | --max-position-embeddings 8192 \ 37 | --tokenizer-type HuggingFaceTokenizer \ 38 | --make-vocab-size-divisible-by 1 \ 39 | --use-mcore-models \ 40 | --rotary-base 10000 \ 41 | --export-model-type MambaModel \ 42 | " 43 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=nvidia/Nemotron-Mini-4B-Instruct 5 | TOKENIZER_MODEL=nvidia/Nemotron-Mini-4B-Instruct 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 0.5 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization LayerNorm \ 22 | --apply-layernorm-1p \ 23 | --squared-relu \ 24 | --num-layers 32 \ 25 | --hidden-size 3072 \ 26 | --ffn-hidden-size 9216 \ 27 | --num-attention-heads 24 \ 28 | --group-query-attention \ 29 | --num-query-groups 8 \ 30 | --seq-length 4096 \ 31 | --max-position-embeddings 4096 \ 32 | --tokenizer-type HuggingFaceTokenizer \ 33 | --make-vocab-size-divisible-by 1 \ 34 | --use-mcore-models \ 35 | --rotary-base 10000 \ 36 | " 37 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/mmlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 4 | 5 | # Common arguments and base model specific arguments 6 | source "${SCRIPT_DIR}/conf/arguments.sh" 7 | 8 | # Extra arguments of this script 9 | MLM_DEFAULT_ARGS="--finetune --auto-detect-ckpt-format --export-te-mcore-model --sequence-parallel" 10 | 11 | ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/mmlu.py \ 12 | ${MODEL_ARGS} \ 13 | --tensor-model-parallel-size ${TP} \ 14 | --expert-model-parallel-size ${EP} \ 15 | --pipeline-model-parallel-size ${PP} \ 16 | --tokenizer-model ${TOKENIZER_MODEL} \ 17 | --load ${MLM_MODEL_CKPT} \ 18 | ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} 19 | 
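# TP/EP/PP, LAUNCH_SCRIPT, MLM_MODEL_CKPT and MLM_EXTRA_ARGS are expected to be supplied by conf/arguments.sh (sourced above) or by the caller's environment.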
-------------------------------------------------------------------------------- /examples/post_training/modelopt/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | jsonlines 3 | mamba-ssm 4 | causal-conv1d 5 | nvidia-modelopt 6 | omegaconf 7 | pulp 8 | tensorstore!=0.1.46,!=0.1.72 9 | torchprofile 10 | transformers 11 | zarr 12 | -------------------------------------------------------------------------------- /examples/t5/t5_mcore_train_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/examples/t5/t5_mcore_train_curve.png -------------------------------------------------------------------------------- /images/model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/model_table.png -------------------------------------------------------------------------------- /images/strong_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/strong_scaling.png -------------------------------------------------------------------------------- /images/weak_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/weak_scaling.png -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import megatron.core.tensor_parallel 4 | import megatron.core.utils 5 | from megatron.core import parallel_state 6 | from megatron.core.distributed import DistributedDataParallel 7 | from megatron.core.inference_params import InferenceParams 8 | from megatron.core.model_parallel_config import ModelParallelConfig 9 | from megatron.core.package_info import ( 10 | __contact_emails__, 11 | __contact_names__, 12 | __description__, 13 | __download_url__, 14 | __homepage__, 15 | __keywords__, 16 | __license__, 17 | __package_name__, 18 | __repository_url__, 19 | __shortversion__, 20 | __version__, 21 | ) 22 | from megatron.core.timers import Timers 23 | 24 | # Alias parallel_state as mpu, its legacy name 25 | mpu = parallel_state 26 | 27 | __all__ = [ 28 | "parallel_state", 29 | "tensor_parallel", 30 | "utils", 31 | "DistributedDataParallel", 32 | "InferenceParams", 33 | "ModelParallelConfig", 34 | "Timers", 35 | ] 36 | -------------------------------------------------------------------------------- /megatron/core/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
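# Global flag gating experimental Megatron Core features; disabled by default.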
2 | 3 | ENABLE_EXPERIMENTAL = False 4 | -------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | 4 | LIBNAME = helpers_cpp 5 | LIBEXT = $(shell python3-config --extension-suffix) 6 | 7 | OUT = $(LIBNAME)$(LIBEXT) 8 | SRC = helpers.cpp 9 | 10 | default: $(OUT) 11 | 12 | $(OUT): $(SRC) 13 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 14 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .config import RetroGPTChunkDatasets 4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig 5 | from .query.retro_dataset import get_retro_datasets 6 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - Embedder: Base class for all Bert embedders. 7 | - RetroBertEmbedders: Container class for in-memory and on-disk embedders. 8 | - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. 9 | - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. 10 | - RetroTokenizers: Container class for GPT and Bert tokenizers. 11 | """ 12 | 13 | from .bert_embedders import Embedder, RetroBertEmbedders 14 | from .config import RetroPreprocessingConfig 15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets 16 | from .tokenizers import RetroTokenizers 17 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/gpt_chunk_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for GPT chunk datasets (train, valid, and test).""" 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class RetroGPTChunkDatasets: 10 | """Container dataclass for GPT chunk datasets.""" 11 | 12 | # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. 13 | train: dict = None 14 | valid: dict = None 15 | test: dict = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Container class for GPT and Bert tokenizers.""" 4 | 5 | from dataclasses import dataclass 6 | 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer 8 | 9 | 10 | @dataclass 11 | class RetroTokenizers: 12 | """Container class for GPT and Bert tokenizers.""" 13 | 14 | gpt: MegatronTokenizer = None 15 | bert: MegatronTokenizer = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - build_db: Build a chunk database from a list of indexed datasets. 7 | """ 8 | 9 | from .build import build_db 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Required external libraries for Retro preprocessing.""" 4 | 5 | import importlib 6 | 7 | required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert 8 | 9 | for lib in required_libs: 10 | try: 11 | globals()[lib] = importlib.import_module(lib) 12 | except ImportError as e: 13 | raise Exception( 14 | f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." 15 | ) 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - train_index: Train an index on representative vectors. 7 | - add_to_index: Add vectors to a trained index. 8 | - build_index: Wrapper function that calls above two functions. 9 | """ 10 | 11 | from .build import add_to_index, build_index, train_index 12 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index. 7 | """ 8 | 9 | from .faiss_base import FaissBaseIndex 10 | from .faiss_par_add import FaissParallelAddIndex 11 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for querying the pretraining dataset.""" 4 | 5 | import os 6 | 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset 8 | 9 | 10 | def get_query_dir(project_dir: str) -> str: 11 | """Get root directory of all saved query data. 12 | 13 | Args: 14 | project_dir (str): Retro project dir. 
15 | 16 | Returns: 17 | Path to query sub-directory in Retro project. 18 | """ 19 | return os.path.join(project_dir, "query") 20 | 21 | 22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: 23 | """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 24 | 25 | Args: 26 | project_dir (str): Retro project dir. 27 | key (str): Dataset split key; 'train', 'valid', or 'test'. 28 | dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. 29 | 30 | Returns: 31 | Path to directory containing this dataset's neighbors within Retro project. 32 | """ 33 | return os.path.join( 34 | get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}") 35 | ) 36 | -------------------------------------------------------------------------------- /megatron/core/datasets/utils_s3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.datasets.object_storage_utils import ( # pylint: disable=unused-import 3 | S3_PREFIX, 4 | S3Client, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersistentObject, ShardedObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | remove_sharded_tensors, 11 | save, 12 | ) 13 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | from megatron.core.dist_checkpointing.strategies.common import register_default_common_strategies 5 | 6 | # We load "common" strategies by default to be always available 7 | register_default_common_strategies() 8 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ FS Reader with metadata cached support. """ 4 | 5 | import os 6 | from typing import Union 7 | 8 | from torch.distributed.checkpoint import FileSystemReader, Metadata 9 | 10 | 11 | class CachedMetadataFileSystemReader(FileSystemReader): 12 | """ 13 | Extends FileSystemReader to cache metadata for improved performance. 14 | 15 | Attributes: 16 | _cached_metadata (Metadata or None): Cached metadata from the file system. 17 | """ 18 | 19 | def __init__(self, path: Union[str, os.PathLike]) -> None: 20 | """ 21 | Initialize with file system path. 22 | 23 | Args: 24 | path (Union[str, os.PathLike]): Path to the checkpoint directory or file. 25 | """ 26 | super().__init__(path=path) 27 | self._cached_metadata = None 28 | 29 | def read_metadata(self) -> Metadata: 30 | """ 31 | Read metadata from file system, caching for subsequent calls. 
32 | 33 | Returns: 34 | Metadata: Checkpoint metadata. 35 | """ 36 | if self._cached_metadata is None: 37 | self._cached_metadata = super().read_metadata() 38 | return self._cached_metadata 39 | -------------------------------------------------------------------------------- /megatron/core/distributed/README.md: -------------------------------------------------------------------------------- 1 | ## How to use PyTorch FSDP2? 2 | 3 | Add these flags to enable Torch FSDP2: 4 | 5 | ``` 6 | --use-torch-fsdp2 7 | --no-gradient-accumulation-fusion 8 | --ckpt-format torch_dist 9 | ``` 10 | 11 | Note that `CUDA_MAX_CONNECTIONS=1` should not be set; leaving it unset allows FSDP communication and computation on the primary stream to fully overlap. 12 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from packaging.version import Version 4 | 5 | from .distributed_data_parallel import DistributedDataParallel 6 | from .distributed_data_parallel_config import DistributedDataParallelConfig 7 | from .finalize_model_grads import finalize_model_grads 8 | from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel 9 | from .torch_fully_sharded_data_parallel_config import TorchFullyShardedDataParallelConfig 10 | -------------------------------------------------------------------------------- /megatron/core/distributed/custom_fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fully_sharded_data_parallel import FullyShardedDataParallel 4 | -------------------------------------------------------------------------------- /megatron/core/distributed/torch_fully_sharded_data_parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Union 5 | 6 | from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig 7 | 8 | 9 | @dataclass 10 | class TorchFullyShardedDataParallelConfig(DistributedDataParallelConfig): 11 | """Configuration for TorchFullyShardedDataParallel.""" 12 | 13 | reshard_after_forward: Union[bool, int] = True 14 | """ 15 | Controls the parameter behavior after forward. 16 | 17 | See PyTorch for complete documentation: 18 | https://github.com/pytorch/pytorch/blob/ac8ddf115065106f038865389a07f2d0c9ed5e11/torch/distributed/fsdp/_fully_shard/_fully_shard.py#L97C31-L97C49 # pylint: disable=line-too-long 19 | """ 20 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
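# Core-level enums: `ModelType` distinguishes encoder-or-decoder stacks (e.g., GPT, BERT)
# from encoder-and-decoder models (e.g., T5) plus the Retro variants, and `Fp8Recipe`
# names the supported FP8 scaling recipes.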
2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | """Model type.""" 8 | 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | retro_encoder = 3 12 | retro_decoder = 4 13 | 14 | 15 | class Fp8Recipe(str, enum.Enum): 16 | """FP8 recipe names: delayed, tensorwise, mxfp8, blockwise.""" 17 | 18 | delayed = "delayed" 19 | tensorwise = "tensorwise" 20 | mxfp8 = "mxfp8" 21 | blockwise = "blockwise" 22 | -------------------------------------------------------------------------------- /megatron/core/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/data_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) 6 | -------------------------------------------------------------------------------- /megatron/core/export/export_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | 8 | @dataclass 9 | class ExportConfig: 10 | """Base configuration for Megatron Core Export. 11 | 12 | These parameters control the export settings for TRT-LLM. 13 | """ 14 | 15 | inference_tp_size: int = 1 16 | 17 | inference_pp_size: int = 1 18 | 19 | use_parallel_embedding: bool = False 20 | 21 | use_embedding_sharing: Optional[bool] = None 22 | 23 | def __post_init__(self): 24 | if self.use_embedding_sharing is not None: 25 | with warnings.catch_warnings(): 26 | warnings.simplefilter("always") 27 | warnings.warn( 28 | "use_embedding_sharing is deprecated in ExportConfig, " 29 | "use share_embeddings_and_output_weights in TRTLLMHelper instead", 30 | DeprecationWarning, 31 | stacklevel=3, 32 | ) 33 | -------------------------------------------------------------------------------- /megatron/core/export/model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | ModelType = Enum( 6 | 'ModelType', 7 | ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma", "nemotron_nas"], 8 | ) 9 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/engine_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import tensorrt_llm 4 | 5 | from megatron.core.export.model_type import ModelType 6 | 7 | TRT_MODEL_CONFIG = { 8 | ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, 9 | ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, 10 | ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, 11 | ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, 12 | ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, 13 | ModelType.gemma: tensorrt_llm.models.GemmaConfig, 14 | ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, 15 | ModelType.nemotron_nas: tensorrt_llm.models.nemotron_nas.config.DeciConfig, 16 | } 17 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.export.model_type import ModelType 4 | 5 | TRT_MODEL_TYPE_STRING = { 6 | ModelType.gpt: 'GPTForCausalLM', 7 | ModelType.gptnext: 'GPTForCausalLM', 8 | ModelType.starcoder: 'GPTForCausalLM', 9 | ModelType.mixtral: 'LlamaForCausalLM', 10 | ModelType.llama: 'LlamaForCausalLM', 11 | ModelType.gemma: 'GemmaForCausalLM', 12 | ModelType.falcon: 'FalconForCausalLM', 13 | ModelType.nemotron_nas: 'DeciLMForCausalLM', 14 | } 15 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trtllm_weights_converter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trtllm_weights_converter/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | GATED_ACTIVATION = ["swiglu", "geglu", "fast-swiglu", "fast-geglu"] 4 | 5 | 6 | def is_gated_activation(helper):
 7 | """Check whether the model uses a gated activation.""" 8 | return helper.activation in GATED_ACTIVATION or helper.transformer_config.gated_linear_unit 9 | -------------------------------------------------------------------------------- /megatron/core/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/extensions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/inference/common_inference_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.inference.sampling_params import ( # noqa: F401 # pylint: disable=unused-import 3 | SamplingParams as CommonInferenceParams, 4 | ) 5 | -------------------------------------------------------------------------------- /megatron/core/inference/contexts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | 5 | from .base_context import BaseInferenceContext 6 | from .dynamic_chunk_allocator import ChunkAllocator 7 | from .static_context import StaticInferenceContext 8 | 9 | warnings.warn( 10 | "The following imports from `dynamic_context.py` will be removed " 11 | "from this file in `megatron-core` 0.14. The imports here result in " 12 | "a cyclic import issue that causes rotary embeddings to import " 13 | "from Apex rather than Transformer Engine.", 14 | DeprecationWarning, 15 | ) 16 | from .dynamic_context import ( 17 | ChunkOverflowError, 18 | ContextOverflowError, 19 | DynamicInferenceContext, 20 | RequestOverflowError, 21 | TokenOverflowError, 22 | ) 23 | -------------------------------------------------------------------------------- /megatron/core/inference/contexts/base_context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | 5 | 6 | class BaseInferenceContext(abc.ABC): 7 | """Base class for inference contexts. 8 | 9 | Currently extended by `StaticInferenceContext` and `DynamicInferenceContext`. 10 | Extend this class for any future context types. 11 | """ 12 | 13 | @abc.abstractmethod 14 | def is_static_batching(self) -> bool: 15 | """Return `True` if context uses static batching.""" 16 | pass 17 | 18 | def is_dynamic_batching(self) -> bool: 19 | """Return `True` if context uses dynamic batching.""" 20 | return not self.is_static_batching() 21 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .abstract_engine import AbstractEngine 4 | from .dynamic_engine import DynamicInferenceEngine 5 | from .static_engine import StaticInferenceEngine 6 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/abstract_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | 5 | 6 | class AbstractEngine(ABC): 7 | 8 | @abstractmethod 9 | def generate(self) -> dict: 10 | """The abstract backend's generate function. 11 | 12 | To define a new backend, implement this and return the outputs as a dictionary. 13 | 14 | Returns: 15 | dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`.
16 | """ 17 | pass 18 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/mcore_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .static_engine import ( # noqa: F401 # pylint: disable=unused-import 4 | StaticInferenceEngine as MCoreEngine, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/t5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import 4 | TextGenerationController as SimpleTextGenerationController, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/inference/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | class Counter: 3 | """A simple counter class 4 | 5 | This class is responsible for assigning request ids to incoming requests 6 | """ 7 | 8 | def __init__(self, start: int = 0) -> None: 9 | self.counter = start 10 | 11 | def __next__(self) -> int: 12 | i = self.counter 13 | self.counter += 1 14 | return i 15 | 16 | def reset(self) -> None: 17 | """Reset counter""" 18 | self.counter = 0 19 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .inference.contexts import ( # noqa: F401 # pylint: disable=unused-import 4 | StaticInferenceContext as InferenceParams, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | from megatron.core.utils import is_torch_min_version 6 | 7 | jit_fuser = torch.jit.script 8 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 9 | if is_torch_min_version("2.2.0a0"): 10 | jit_fuser = torch.compile 11 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .t5_model import T5Model 3 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .rope_utils import apply_rotary_pos_emb 4 | from .rotary_pos_embedding import MultimodalRotaryEmbedding, RotaryEmbedding 5 | from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale 6 | -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 
11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .gpt_model import GPTModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from .module import HuggingFaceModule, build_hf_model 3 | -------------------------------------------------------------------------------- /megatron/core/models/huggingface/clip_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from transformers import AutoModel 4 | from transformers.models.siglip.modeling_siglip import SiglipEncoderLayer 5 | 6 | from megatron.core.models.huggingface import HuggingFaceModule 7 | 8 | 9 | class SiglipHuggingFaceModel(HuggingFaceModule): 10 | """ 11 | Wrapper for Siglip HuggingFace models. 12 | """ 13 | 14 | # Currently applies to FSDP2 only, not the custom FSDP implementation. 15 | _fsdp_modules = [SiglipEncoderLayer] 16 | 17 | def __init__(self, config): 18 | super().__init__(config) 19 | self.model = AutoModel.from_pretrained(config.vision_model_type.split("hf://")[1]) 20 | 21 | def forward(self, *args, **kwargs): 22 | """Siglip forward.""" 23 | x = self.model(*args, **kwargs) 24 | x = x["last_hidden_state"] 25 | 26 | return x 27 | -------------------------------------------------------------------------------- /megatron/core/models/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .mamba_model import MambaModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.models.mimo.config.base_configs import MimoModelConfig 4 | from megatron.core.models.mimo.model import MimoModel 5 | from megatron.core.models.mimo.submodules.audio import AudioModalitySubmodules 6 | from megatron.core.models.mimo.submodules.base import ModalitySubmodules 7 | from megatron.core.models.mimo.submodules.vision import VisionModalitySubmodules 8 | 9 | __all__ = [ 10 | 'MimoModelConfig', 11 | 'MimoModel', 12 | # Submodule classes 13 | 'ModalitySubmodules', 14 | 'VisionModalitySubmodules', 15 | 'AudioModalitySubmodules', 16 | ] 17 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from megatron.core.models.mimo.config.base_configs import MimoModelConfig 4 | 5 | __all__ = ['MimoModelConfig'] 6 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.models.mimo.model.base import MimoModel 3 | 4 | __all__ = ['MimoModel'] 5 | -------------------------------------------------------------------------------- /megatron/core/models/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - RetroConfig: configuration dataclass for RetroModel. 7 | - RetroModel: The Retro model. 8 | - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 9 | """ 10 | 11 | from .config import RetroConfig 12 | from .decoder_spec import get_retro_decoder_block_spec 13 | from .model import RetroModel 14 | -------------------------------------------------------------------------------- /megatron/core/models/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | import torch 6 | 7 | 8 | def get_config_path(project_dir: str) -> str: 9 | """Config copy stored within retro project dir.""" 10 | return os.path.join(project_dir, "config.json") 11 | 12 | 13 | def get_gpt_data_dir(project_dir: str) -> str: 14 | """Get project-relative directory of GPT bin/idx datasets.""" 15 | return os.path.join(project_dir, "data") 16 | 17 | 18 | # ** Note ** : Retro's compatibility between cross attention and Flash/Fused 19 | # Attention is currently a work in progress. We default to returning None for 20 | # now. 21 | # def get_all_true_mask(size, device): 22 | # return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) 23 | def get_all_true_mask(size, device): 24 | """Return None for now; see the compatibility note above.""" 25 | return None 26 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/optimizer/cpu_offloading/README.md: -------------------------------------------------------------------------------- 1 | ## How to use? 2 | 3 | Add these flags to enable optimizer CPU offload in MCore. 4 | 5 | ```bash 6 | --optimizer-cpu-offload 7 | --optimizer-offload-fraction 1.0 8 | --use-precision-aware-optimizer 9 | ``` 10 | 11 | ## Configuration Recommendations 12 | 13 | The gradient copy from GPU to CPU, the CPU optimizer step, and the subsequent parameter copy from CPU to GPU can all be time-consuming; it is recommended to pass the flag `--overlap-cpu-optimizer-d2h-h2d` so that they execute concurrently, as in the sketch below.
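As a concrete illustration, here is a minimal Python sketch that assembles these flags into a training launch command. Only the offload flags themselves are taken from this README; the `torchrun` invocation, the GPU count, and the `pretrain_gpt.py` entry point are placeholders for your own setup.

```python
# Minimal sketch: launch pretraining with optimizer CPU offload enabled.
# Everything other than the offload flags is a placeholder; substitute your
# own entry point and training arguments.
import subprocess

offload_flags = [
    "--optimizer-cpu-offload",
    "--optimizer-offload-fraction", "1.0",
    "--use-precision-aware-optimizer",
    "--overlap-cpu-optimizer-d2h-h2d",  # overlap the D2H/H2D copies with the CPU step
]

cmd = ["torchrun", "--nproc_per_node", "8", "pretrain_gpt.py", *offload_flags]
subprocess.run(cmd, check=True)
```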
14 | -------------------------------------------------------------------------------- /megatron/core/optimizer/cpu_offloading/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from .hybrid_optimizer import HybridDeviceOptimizer 3 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 13 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from dataclasses import dataclass 3 | 4 | from torch import Tensor 5 | 6 | 7 | @dataclass 8 | class PackedSeqParams: 9 | ''' 10 | Parameters passed to TEDotProductAttention and fused RoPE kernels for the 11 | `thd` (packed) sequence format. 12 | ''' 13 | 14 | qkv_format: str = None 15 | cu_seqlens_q: Tensor = None 16 | cu_seqlens_kv: Tensor = None 17 | cu_seqlens_q_padded: Tensor = None 18 | cu_seqlens_kv_padded: Tensor = None 19 | max_seqlen_q: Tensor = None 20 | max_seqlen_kv: Tensor = None 21 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .schedules import get_forward_backward_func 3 | -------------------------------------------------------------------------------- /megatron/core/post_training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Integrations with NVIDIA TensorRT Model Optimizer (referred to as ModelOpt).
3 | 4 | ModelOpt is a library comprising state-of-the-art model optimization techniques 5 | including quantization and sparsity to compress models for efficient inference on 6 | NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless 7 | experience for users to optimize their Megatron-core models for inference. 8 | More details on ModelOpt, including installation and usage, can be found at 9 | https://github.com/NVIDIA/TensorRT-Model-Optimizer. 10 | """ 11 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | packaging 3 | -------------------------------------------------------------------------------- /megatron/core/ssm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/ssm/mlp_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Optional 4 | 5 | from megatron.core.process_groups_config import ModelCommProcessGroups 6 | from megatron.core.transformer import ( 7 | TransformerConfig, 8 | TransformerLayer, 9 | TransformerLayerSubmodules, 10 | ) 11 | 12 | 13 | class MLPLayer(TransformerLayer): 14 | """Drop-in replacement for TransformerLayer but initializes only an MLP via the spec.""" 15 | 16 | def __init__( 17 | self, 18 | config: TransformerConfig, 19 | submodules: TransformerLayerSubmodules, 20 | layer_number: int = 1, 21 | hidden_dropout: Optional[float] = None, 22 | model_comm_pgs: Optional[ModelCommProcessGroups] = None, 23 | ): 24 | super().__init__( 25 | config=config, 26 | submodules=submodules, 27 | layer_number=layer_number, 28 | hidden_dropout=hidden_dropout, 29 | model_comm_pgs=model_comm_pgs, 30 | ) 31 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
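# Convenience re-exports of the core transformer building blocks: the MegatronModule base
# class, module-spec utilities (ModuleSpec, build_module), the transformer configs, and
# TransformerLayer with its submodule container.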
2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import MLATransformerConfig, TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/transformer_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | 5 | warnings.warn( 6 | """The 'megatron.core.transformer.custom_layers.transformer_engine' 7 | module is deprecated and will be removed in 0.10.0. Please use 8 | 'megatron.core.extensions.transformer_engine' instead.""", 9 | DeprecationWarning, 10 | stacklevel=2, 11 | ) 12 | from megatron.core.extensions.transformer_engine import * 13 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | """Model Type 10 | 11 | encoder_or_decoder for BERT, GPT, etc. 12 | encoder_and_decoder for multimodal models, T5, etc. 13 | """ 14 | 15 | encoder_or_decoder = 1 16 | encoder_and_decoder = 2 17 | 18 | 19 | # class LayerType(enum.Enum): 20 | # encoder = 1 21 | # decoder = 2 22 | 23 | 24 | class AttnType(enum.Enum): 25 | """Attention type""" 26 | 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | 31 | class AttnMaskType(enum.Enum): 32 | """Attention Mask Type""" 33 | 34 | padding = 1 35 | causal = 2 36 | no_mask = 3 # only used for TE 37 | padding_causal = 4 # only used for thd attention 38 | arbitrary = 5 39 | 40 | 41 | class AttnBackend(enum.Enum): 42 | """Attention Backend""" 43 | 44 | flash = 1 45 | fused = 2 46 | unfused = 3 47 | local = 4 48 | auto = 5 49 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
20 | Such a func is handy for ops like `bias_dropout_fusion` which themselves 21 | return a function at runtime based on passed arguments 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | """Check if grouped_gemm is available.""" 11 | return grouped_gemm is not None 12 | 13 | 14 | def assert_grouped_gemm_is_available(): 15 | """Assert that grouped_gemm is available.""" 16 | assert grouped_gemm_is_available(), ( 17 | "Grouped GEMM is not available. Please run " 18 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4`." 19 | ) 20 | 21 | 22 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 23 | -------------------------------------------------------------------------------- /megatron/core/transformer/torch_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.transformer.torch_norm import WrappedTorchNorm 3 | 4 | WrappedTorchLayerNorm = WrappedTorchNorm 5 | -------------------------------------------------------------------------------- /megatron/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/endpoints/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import threading 5 | 6 | GENERATE_NUM = 0 7 | BEAM_NUM = 1 8 | LOCK = threading.Lock() 9 | 10 | 11 | def send_do_generate(): 12 | choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device="cuda") 13 | torch.distributed.broadcast(choice, 0) 14 | 15 | 16 | def send_do_beam_search(): 17 | choice = torch.tensor([BEAM_NUM], dtype=torch.long, device="cuda") 18 | torch.distributed.broadcast(choice, 0) 19 | -------------------------------------------------------------------------------- /megatron/inference/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/legacy/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /megatron/legacy/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA Apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/legacy/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | -------------------------------------------------------------------------------- /megatron/legacy/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/legacy/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
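# RMSNorm rescales activations by the root-mean-square of the last dimension instead of
# centering and scaling as LayerNorm does:
#     y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight
# The forward pass below computes the norm in float32 and casts back to the input dtype
# before applying the learned weight.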
2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False, 12 | config: dict = None): 13 | """RMS Normalization module 14 | 15 | Args: 16 | dim (int): The width of input, i.e. hidden size 17 | eps (float): epsilon to use for the norm, defaults to 1e-6 18 | sequence_parallel (bool): Set to true if sequence parallelism is being used, 19 | this marks the weights as needing to be allreduced. 20 | config (dict): Unused here; accepted for interface compatibility. 21 | """ 22 | super().__init__() 23 | self.eps = eps 24 | self.weight = nn.Parameter(torch.ones(dim)) 25 | 26 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 27 | 28 | def _norm(self, x): 29 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 30 | 31 | def forward(self, x): 32 | output = self._norm(x.float()).type_as(x) 33 | return output * self.weight 34 | -------------------------------------------------------------------------------- /megatron/legacy/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import warnings 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def resize(input, 8 | size=None, 9 | scale_factor=None, 10 | mode='nearest', 11 | align_corners=None, 12 | warning=True): 13 | if warning: 14 | if size is not None and align_corners: 15 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 16 | output_h, output_w = tuple(int(x) for x in size) 17 | if output_h > input_h or output_w > input_w: 18 | if ((output_h > 1 and output_w > 1 and input_h > 1 19 | and input_w > 1) and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1)): 21 | warnings.warn( 22 | f'When align_corners={align_corners}, ' 23 | 'the output would be more aligned if ' 24 | f'input size {(input_h, input_w)} is `x+1` and ' 25 | f'out size {(output_h, output_w)} is `nx+1`') 26 | if isinstance(size, torch.Size): 27 | size = tuple(int(x) for x in size) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /megatron/legacy/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/legacy/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/post_training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/post_training/algos/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/post_training/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
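# Helpers used by the post-training flows: per-rank GPU memory reporting and loading the
# MTBench prompts in OpenAI chat-completion format.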
2 | 3 | import torch 4 | from datasets import load_dataset 5 | 6 | 7 | def get_current_memory_info(): 8 | """Get current memory usage.""" 9 | remaining_mem, total_mem = torch.cuda.mem_get_info() 10 | info = "rank {:3}/{:3} memory remaining {:03}% ({}/{} MB) ".format( 11 | torch.distributed.get_rank(), 12 | torch.distributed.get_world_size(), 13 | int(remaining_mem * 100 / total_mem), 14 | remaining_mem // 1048576, 15 | total_mem // 1048576, 16 | ) 17 | return info 18 | 19 | 20 | def report_current_memory_info(): 21 | """Report current memory usage.""" 22 | print(get_current_memory_info(), flush=True) 23 | torch.distributed.barrier() 24 | 25 | 26 | def get_mtbench_chat_data(): 27 | """Return a MTBench dataset.""" 28 | 29 | def mtbench_to_oai_chat(example): 30 | """Convert MTBench data to OpenAI chat completion format.""" 31 | conversations = [] 32 | for prompt in example["prompt"]: 33 | conversations.append({"role": "user", "content": prompt}) 34 | example["conversations"] = conversations 35 | return example 36 | 37 | dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train") 38 | return dataset.map(mtbench_to_oai_chat) 39 | -------------------------------------------------------------------------------- /megatron/training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args 6 | from .global_vars import get_signal_handler 7 | from .global_vars import get_tokenizer 8 | from .global_vars import get_tensorboard_writer 9 | from .global_vars import get_wandb_writer 10 | from .global_vars import get_one_logger 11 | from .global_vars import get_adlr_autoresume 12 | from .global_vars import get_timers 13 | from .initialize import initialize_megatron 14 | from .training import pretrain, get_model, get_train_valid_test_num_samples 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/training/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | 8 | @jit_fuser 9 | def squared_relu(x: torch.Tensor) -> torch.Tensor: 10 | return torch.pow(F.relu(x), 2) 11 | 12 | 13 | @jit_fuser 14 | def quick_gelu(x: torch.Tensor) -> torch.Tensor: 15 | return x * torch.sigmoid(1.702 * x) 16 | 17 | @jit_fuser 18 | def fast_gelu(x: torch.Tensor) -> torch.Tensor: 19 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) 20 | -------------------------------------------------------------------------------- /megatron/training/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 
13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries that come from the blacklisted modules 20 | # through (e.g., PyTorch Distributed). 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/training/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | check_untyped_defs = False 4 | disallow_untyped_calls = False 5 | disallow_untyped_defs = False 6 | disallow_incomplete_defs = False 7 | 8 | disable_error_code = call-arg,operator,var-annotated,union-attr,import-untyped 9 | 10 | # Enable only `assignment` error checking 11 | enable_error_code = assignment -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | internal: mark a test as a test to private/internal functions. 5 | flaky: mark flaky tests for LTS environment 6 | flaky_in_dev: mark flaky tests for DEV environment 7 | -------------------------------------------------------------------------------- /requirements/pytorch_24.01/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest_asyncio 6 | pytest-cov 7 | pytest_mock 8 | pytest-random-order 9 | sentencepiece 10 | tiktoken 11 | wrapt 12 | zarr 13 | wandb 14 | triton==2.1.0 15 | tensorstore!=0.1.46,!=0.1.72 16 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 17 | nvtx 18 | -------------------------------------------------------------------------------- /requirements/pytorch_24.07/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest_asyncio 6 | pytest-cov 7 | pytest_mock 8 | pytest-random-order 9 | sentencepiece 10 | tiktoken 11 | wrapt 12 | zarr 13 | wandb 14 | tensorstore!=0.1.46 15 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 16 | nvidia-resiliency-ext 17 | -------------------------------------------------------------------------------- /requirements/pytorch_24.10/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | zarr 3 | tensorstore!=0.1.46,!=0.1.72 4 | torch 5 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 6 | nvidia-resiliency-ext; platform_machine == "x86_64" 7 | nvtx 8 | -------------------------------------------------------------------------------- /requirements/pytorch_25.03/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | sentencepiece 9 | tiktoken 10 | wrapt 11 | zarr 12 | wandb 13 | tensorstore!=0.1.46,!=0.1.72 14 | torch 15 | nvidia-modelopt[torch]>=0.23.2; 
sys_platform != "darwin" 16 | nvtx 17 | -------------------------------------------------------------------------------- /requirements_ci.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | wrapt 3 | pytest 4 | pytest_asyncio 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | wandb -------------------------------------------------------------------------------- /requirements_mlm.txt: -------------------------------------------------------------------------------- 1 | tiktoken 2 | flask-restful 3 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron.training import get_args, print_rank_0 6 | from megatron.legacy.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49689, "5": 10.48165, "10": 10.50192, "15": 10.45891, "20": 10.44599, "25": 10.35067, "30": 10.16617, "35": 10.04377, "40": 9.90903, "45": 9.75804, "50": 9.67525}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2071.0, "5": 2603.0, "10": 2120.0, "15": 2502.0, "20": 2235.0, "25": 2509.0, "30": 2938.0, "35": 2948.0, "40": 2197.0, "45": 3921.0, "50": 3479.0}}, "mem-allocated-bytes": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.89689, "5": 1.31101, "10": 1.31458, "15": 1.39008, "20": 1.43723, "25": 1.38294, "30": 1.37996, "35": 1.34031, "40": 1.38199, "45": 1.37809, "50": 1.40054}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.48367, "5": 10.47639, "10": 10.47262, "15": 10.47929, "20": 10.45433, "25": 10.38155, "30": 10.21158, "35": 10.1058, "40": 9.98135, "45": 9.8233, "50": 9.7299}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2570.0, "5": 2068.0, "10": 2597.0, "15": 2038.0, "20": 2750.0, "25": 2493.0, "30": 2850.0, "35": 2434.0, "40": 3418.0, "45": 3632.0, "50": 2132.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1784014336.0, "5": 1784014336.0, "10": 1784014336.0, "15": 1784014336.0, "20": 1784014336.0, "25": 1784014336.0, "30": 1784014336.0, "35": 1784014336.0, "40": 1784014336.0, "45": 1784014336.0, "50": 1784014336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2365860864.0, "5": 3108842496.0, "10": 3108842496.0, "15": 3108842496.0, "20": 3108842496.0, "25": 3108842496.0, "30": 3108842496.0, "35": 3108842496.0, "40": 3108842496.0, "45": 3108842496.0, "50": 3108842496.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.5518, "5": 1.13792, "10": 1.13766, "15": 1.22776, "20": 1.1374, "25": 1.18568, "30": 1.23204, "35": 1.14281, "40": 1.37036, "45": 1.13878, "50": 1.3794}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34954, "30": 10.17283, "35": 10.0427, "40": 9.9076, "45": 9.7577, "50": 9.67688}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2166.0, "25": 2639.0, "30": 2769.0, "35": 3080.0, "40": 2282.0, "45": 3831.0, "50": 3519.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2300849152.0, "5": 3043311616.0, "10": 3043311616.0, "15": 3043311616.0, "20": 3043311616.0, "25": 3043311616.0, "30": 3043311616.0, "35": 3043311616.0, "40": 3043311616.0, "45": 3043311616.0, "50": 3043311616.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.92253, "5": 1.17517, "10": 1.16204, "15": 1.1534, "20": 1.15142, "25": 1.1777, "30": 1.14956, "35": 1.15257, "40": 1.14342, "45": 1.14293, "50": 1.14651}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49226, "5": 10.49833, "10": 10.49375, "15": 10.48886, "20": 10.46612, "25": 10.39219, "30": 10.20812, "35": 10.06926, "40": 9.93854, "45": 9.75472, "50": 9.6868}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2098.0, "5": 2869.0, "10": 2322.0, "15": 2605.0, "20": 2299.0, "25": 2583.0, "30": 2637.0, "35": 3051.0, "40": 1841.0, "45": 3921.0, "50": 3392.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3375511040.0, "5": 3375511040.0, "10": 3375511040.0, "15": 3375511040.0, "20": 3375511040.0, "25": 3375511040.0, "30": 3375511040.0, "35": 3375511040.0, "40": 3375511040.0, "45": 3375511040.0, "50": 3375511040.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4165166080.0, "5": 5631605760.0, "10": 5631605760.0, "15": 5631605760.0, "20": 5631605760.0, "25": 5631605760.0, "30": 5631605760.0, "35": 5631605760.0, "40": 5631605760.0, "45": 5631605760.0, "50": 5631605760.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.75804, "5": 0.68618, "10": 0.68574, "15": 0.71793, "20": 0.79578, "25": 0.68652, "30": 0.69897, "35": 0.68192, "40": 0.69111, "45": 0.68688, "50": 0.79338}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.47723, "5": 10.4715, "10": 10.46311, "15": 10.48841, "20": 10.44522, "25": 10.35474, "30": 10.2301, "35": 10.08868, "40": 9.93794, "45": 9.80332, "50": 9.70238}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2137.0, "5": 2003.0, "10": 2053.0, "15": 1807.0, "20": 2617.0, "25": 2429.0, "30": 2748.0, "35": 2364.0, "40": 3423.0, "45": 3125.0, "50": 2396.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3404871168.0, "5": 3404871168.0, "10": 3404871168.0, "15": 3404871168.0, "20": 3404871168.0, "25": 3404871168.0, "30": 3404871168.0, "35": 3404871168.0, "40": 3404871168.0, "45": 3404871168.0, "50": 3404871168.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4194526208.0, "5": 5660965376.0, "10": 5660965376.0, "15": 5660965376.0, "20": 5660965376.0, "25": 5660965376.0, "30": 5660965376.0, "35": 5660965376.0, "40": 5660965376.0, "45": 5660965376.0, "50": 5660965376.0}}, 
"iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.23346, "5": 0.60029, "10": 0.63129, "15": 0.587, "20": 0.60414, "25": 0.59205, "30": 0.66378, "35": 0.64433, "40": 0.65072, "45": 0.64763, "50": 0.63206}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49405, "5": 10.49933, "10": 10.49631, "15": 10.4873, "20": 10.46572, "25": 10.39496, "30": 10.2104, "35": 10.07333, "40": 9.94011, "45": 9.75651, "50": 9.69025}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2018.0, "5": 2740.0, "10": 2260.0, "15": 2649.0, "20": 2205.0, "25": 2675.0, "30": 2687.0, "35": 2930.0, "40": 1853.0, "45": 4016.0, "50": 2978.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3375511040.0, "5": 3375511040.0, "10": 3375511040.0, "15": 3375511040.0, "20": 3375511040.0, "25": 3375511040.0, "30": 3375511040.0, "35": 3375511040.0, "40": 3375511040.0, "45": 3375511040.0, "50": 3375511040.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4153629696.0, "5": 5620069376.0, "10": 5620069376.0, "15": 5620069376.0, "20": 5620069376.0, "25": 5620069376.0, "30": 5620069376.0, "35": 5620069376.0, "40": 5620069376.0, "45": 5620069376.0, "50": 5620069376.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.9519, "5": 0.61548, "10": 0.60778, "15": 0.60342, "20": 0.59844, "25": 0.60331, "30": 0.60426, "35": 0.59982, "40": 0.59928, "45": 0.80076, "50": 0.64239}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.46352, "5": 10.45321, "10": 10.4481, "15": 10.45891, "20": 10.41677, "25": 10.34598, "30": 10.1814, "35": 10.03992, "40": 9.90206, "45": 9.74954, "50": 9.66818}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2485.0, "5": 2824.0, "10": 2427.0, "15": 2767.0, "20": 2412.0, "25": 2691.0, "30": 2807.0, "35": 3077.0, "40": 2363.0, "45": 3744.0, "50": 3526.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4360259072.0, "5": 5220507136.0, "10": 5220507136.0, "15": 5220507136.0, "20": 5220507136.0, "25": 5220507136.0, "30": 5220507136.0, "35": 5220507136.0, "40": 5220507136.0, "45": 5220507136.0, "50": 5220507136.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.15656, "5": 0.90105, "10": 0.87495, "15": 0.87775, "20": 0.99829, "25": 0.90462, "30": 0.89264, "35": 0.90859, "40": 1.22654, "45": 0.98086, "50": 0.99661}}} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4681, "5": 10.45367, "10": 10.45093, "15": 10.45833, "20": 10.42029, "25": 10.3405, "30": 10.18378, "35": 10.03886, "40": 9.89837, "45": 9.75107, "50": 9.67018}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2373.0, "5": 2811.0, "10": 2502.0, "15": 2556.0, "20": 2392.0, "25": 2764.0, "30": 2957.0, "35": 3046.0, "40": 2373.0, "45": 3854.0, "50": 3568.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4341384704.0, "5": 5201632768.0, "10": 5201632768.0, "15": 5201632768.0, "20": 5201632768.0, "25": 5201632768.0, "30": 5201632768.0, "35": 5201632768.0, "40": 5201632768.0, "45": 5201632768.0, "50": 5201632768.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22.0069, "5": 0.82183, "10": 1.0945, "15": 0.82371, "20": 0.84695, "25": 1.04803, "30": 0.79308, "35": 0.77873, "40": 0.98672, "45": 0.84816, "50": 0.7713}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42004, "5": 10.44687, "10": 10.44032, "15": 10.43081, "20": 10.40841, "25": 10.32605, "30": 10.18604, "35": 10.03131, "40": 9.91274, "45": 9.75116, "50": 9.66124}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3226.0, "5": 3843.0, "10": 2475.0, "15": 2700.0, "20": 3443.0, "25": 2788.0, "30": 2821.0, "35": 4077.0, "40": 3244.0, "45": 4769.0, "50": 3733.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2477577728.0, "5": 3175497216.0, "10": 3175497216.0, "15": 3175497216.0, "20": 3178637312.0, "25": 3178637312.0, "30": 3178637312.0, "35": 3178637312.0, "40": 3178637312.0, "45": 3178637312.0, "50": 3178637312.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.09431, "5": 2.352, "10": 2.3669, "15": 2.36187, "20": 2.34867, "25": 2.34813, "30": 2.35284, "35": 2.36644, "40": 2.35505, "45": 2.34778, "50": 2.35217}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42626, "5": 10.42178, "10": 10.40882, "15": 10.40955, "20": 10.40433, "25": 10.31113, "30": 10.1472, "35": 10.04626, "40": 9.91097, "45": 9.74281, "50": 9.65795}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3452.0, "5": 3418.0, "10": 3298.0, "15": 3261.0, "20": 3448.0, "25": 2542.0, "30": 4164.0, "35": 3701.0, "40": 3387.0, "45": 4965.0, "50": 3268.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1661765632.0, "5": 1661765632.0, "10": 1661765632.0, "15": 1661765632.0, "20": 1661765632.0, "25": 1661765632.0, "30": 1661765632.0, "35": 1661765632.0, "40": 1661765632.0, "45": 1661765632.0, "50": 1661765632.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2510669824.0, "5": 3207546368.0, "10": 3207546368.0, "15": 3209637888.0, "20": 3209637888.0, "25": 3209637888.0, "30": 3209637888.0, "35": 3209637888.0, "40": 3209637888.0, "45": 3209637888.0, "50": 3209637888.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.05902, "5": 2.10578, "10": 2.07255, "15": 2.28342, "20": 2.04777, "25": 2.03295, "30": 2.0347, "35": 2.05296, "40": 2.03634, "45": 2.02561, "50": 2.04166}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42107, "5": 10.44497, "10": 10.44241, "15": 10.43152, "20": 10.40907, "25": 10.3264, "30": 10.18328, "35": 10.03461, "40": 9.91258, "45": 9.74932, "50": 9.66168}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2229.0, "5": 2848.0, "10": 2437.0, "15": 3644.0, "20": 3449.0, "25": 3783.0, "30": 2913.0, "35": 4128.0, "40": 2230.0, "45": 4790.0, "50": 4716.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2458703360.0, "5": 3155576320.0, "10": 3155576320.0, "15": 3155576320.0, "20": 3155576320.0, "25": 3155576320.0, "30": 3155576320.0, "35": 3155576320.0, "40": 3155576320.0, "45": 3155576320.0, "50": 3155576320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.46827, "5": 2.04207, "10": 2.0714, "15": 2.06559, "20": 2.04371, "25": 2.04465, "30": 2.0474, "35": 2.21838, "40": 2.04636, "45": 2.05719, "50": 2.04581}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 4 | NCCL_ALGO: Tree 5 | CUBLAS_WORKSPACE_CONFIG: :4096:8 6 | MODEL_ARGS: 7 | TEST_TYPE: regular 8 | -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt-nemo/bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 2 11 | trainer.strategy.pipeline_model_parallel_size: 2 12 | trainer.strategy.sequence_parallel: True 13 | data.micro_batch_size: 2 14 | data.global_batch_size: 32 15 | data.seq_length: 512 16 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 17 | TEST_TYPE: regular 18 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 4 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.context_parallel_size: 1 13 | trainer.strategy.sequence_parallel: True 14 | data.micro_batch_size: 1 15 | data.global_batch_size: 8 16 | data.seq_length: 2048 17 | TEST_TYPE: regular 18 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | NVTE_APPLY_QK_LAYER_SCALING: 1 5 | MODEL_ARGS: 6 | trainer.num_nodes: 1 7 | trainer.devices: 8 8 | trainer.max_steps: 50 9 | trainer.val_check_interval: 50 10 | trainer.limit_val_batches: 50 11 | trainer.strategy.tensor_model_parallel_size: 2 12 | trainer.strategy.pipeline_model_parallel_size: 2 13 | trainer.strategy.expert_model_parallel_size: 2 14 | trainer.strategy.context_parallel_size: 1 15 | trainer.strategy.sequence_parallel: True 16 | model.config.num_layers: 12 17 | model.config.hidden_size: 768 18 | model.config.num_attention_heads: 16 19 | model.config.ffn_hidden_size: 3072 20 | model.config.apply_query_key_layer_scaling: True 21 | model.config.bias_activation_fusion: False 22 | model.config.add_bias_linear: False 23 | model.config.num_moe_experts: 8 24 | model.config.moe_grouped_gemm: True 25 | model.config.moe_router_load_balancing_type: aux_loss 26 | model.config.moe_router_topk: 2 27 | model.config.moe_aux_loss_coeff: 1e-2 28 | data.micro_batch_size: 1 29 | data.global_batch_size: 8 30 | data.seq_length: 2048 31 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 32 | TEST_TYPE: regular 33 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 
| trainer.strategy.tensor_model_parallel_size: 1 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.context_parallel_size: 2 13 | trainer.strategy.sequence_parallel: True 14 | model.config.num_layers: 12 15 | model.config.hidden_size: 768 16 | model.config.num_attention_heads: 16 17 | model.config.ffn_hidden_size: 3072 18 | data.micro_batch_size: 4 19 | data.global_batch_size: 64 20 | data.seq_length: 2048 21 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 22 | TEST_TYPE: regular 23 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 2 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.expert_model_parallel_size: 4 13 | trainer.strategy.sequence_parallel: True 14 | model.config.num_layers: 12 15 | model.config.hidden_size: 768 16 | model.config.num_attention_heads: 16 17 | model.config.ffn_hidden_size: 3072 18 | data.micro_batch_size: 1 19 | data.global_batch_size: 8 20 | data.seq_length: 2048 21 | TEST_TYPE: regular 22 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 1 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | data.micro_batch_size: 4 13 | data.global_batch_size: 64 14 | data.seq_length: 512 15 | TEST_TYPE: regular 16 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86114, "5": 10.87296, "10": 10.83903, "15": 10.8216, "20": 10.71697, "25": 10.5566, "30": 10.36032, "35": 10.26583, "40": 10.08719, "45": 9.82374, "50": 9.90498}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1696.0, "5": 1776.0, "10": 1413.0, "15": 1801.0, "20": 1624.0, "25": 1483.0, "30": 1856.0, "35": 1953.0, "40": 2183.0, "45": 2058.0, "50": 2134.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 948653056.0, "5": 948653056.0, "10": 948653056.0, "15": 948653056.0, "20": 948653056.0, "25": 948653056.0, "30": 948653056.0, "35": 948653056.0, "40": 948653056.0, "45": 948653056.0, "50": 948653056.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3275284480.0, "5": 3632653312.0, "10": 3632653312.0, "15": 3632653312.0, "20": 3632653312.0, "25": 3632653312.0, "30": 3632653312.0, "35": 3632653312.0, "40": 3632653312.0, "45": 
3632653312.0, "50": 3632653312.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.68328, "5": 0.11284, "10": 0.1105, "15": 0.1127, "20": 0.11177, "25": 0.11822, "30": 0.11168, "35": 0.10923, "40": 0.11032, "45": 0.11159, "50": 0.10997}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85678, "5": 10.88398, "10": 10.84079, "15": 10.82504, "20": 10.71912, "25": 10.55479, "30": 10.35998, "35": 10.26937, "40": 10.08396, "45": 9.82563, "50": 9.90725}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1654.0, "5": 1860.0, "10": 1317.0, "15": 1759.0, "20": 1730.0, "25": 1552.0, "30": 1895.0, "35": 1987.0, "40": 2099.0, "45": 1993.0, "50": 2085.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 779997184.0, "5": 779997184.0, "10": 779997184.0, "15": 779997184.0, "20": 779997184.0, "25": 779997184.0, "30": 779997184.0, "35": 779997184.0, "40": 779997184.0, "45": 779997184.0, "50": 779997184.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2463815680.0, "5": 2746575872.0, "10": 2746575872.0, "15": 2746575872.0, "20": 2746575872.0, "25": 2746575872.0, "30": 2746575872.0, "35": 2746575872.0, "40": 2746575872.0, "45": 2746575872.0, "50": 2746575872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.21839, "5": 0.13024, "10": 0.13236, "15": 0.13158, "20": 0.12851, "25": 0.12984, "30": 0.13011, "35": 0.12981, "40": 0.12965, "45": 0.13094, "50": 0.13196}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, 
"values": {"1": 10.86122, "5": 10.88248, "10": 10.83515, "15": 10.82747, "20": 10.72762, "25": 10.55769, "30": 10.37915, "35": 10.28345, "40": 10.08809, "45": 9.82642, "50": 9.91341}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1694.0, "5": 2127.0, "10": 1548.0, "15": 1997.0, "20": 1846.0, "25": 1802.0, "30": 2112.0, "35": 2172.0, "40": 2560.0, "45": 2397.0, "50": 2761.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 382956544.0, "5": 382956544.0, "10": 382956544.0, "15": 382956544.0, "20": 382956544.0, "25": 382956544.0, "30": 382956544.0, "35": 382956544.0, "40": 382956544.0, "45": 382956544.0, "50": 382956544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1497803264.0, "5": 1628741632.0, "10": 1628741632.0, "15": 1628741632.0, "20": 1628741632.0, "25": 1628741632.0, "30": 1628741632.0, "35": 1628741632.0, "40": 1628741632.0, "45": 1628741632.0, "50": 1628741632.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3.92164, "5": 0.29494, "10": 0.2941, "15": 0.29069, "20": 0.2914, "25": 0.29245, "30": 0.29159, "35": 0.29034, "40": 0.29023, "45": 0.29123, "50": 0.29039}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86122, "5": 10.88248, "10": 10.83515, "15": 10.82747, "20": 10.72762, "25": 10.55769, "30": 10.37919, "35": 10.28344, "40": 10.08807, "45": 9.82644, "50": 9.9134}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1694.0, "5": 2127.0, "10": 1548.0, "15": 1997.0, "20": 1846.0, "25": 1700.0, "30": 2165.0, "35": 2194.0, "40": 2540.0, "45": 2414.0, "50": 2586.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 382956544.0, "5": 382956544.0, "10": 382956544.0, "15": 382956544.0, "20": 382956544.0, "25": 382956544.0, "30": 382956544.0, "35": 382956544.0, "40": 382956544.0, "45": 382956544.0, "50": 382956544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1497803776.0, "5": 1629265408.0, "10": 1629265408.0, "15": 1629265408.0, "20": 1629265408.0, "25": 1629265408.0, "30": 1629265408.0, "35": 1629265408.0, "40": 1629265408.0, "45": 1629265408.0, "50": 1629265408.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.03009, "5": 0.32279, "10": 0.32497, "15": 0.32097, "20": 0.31241, "25": 0.30965, "30": 0.31321, "35": 0.30989, "40": 0.3143, "45": 0.31488, "50": 0.31594}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.57354, "5": 12.58052, "10": 12.47389, "15": 11.80615, "20": 11.49679, "25": 10.98441}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 521040608.0, "5": 520996544.0, "10": 521180480.0, "15": 521592480.0, "20": 521134336.0, "25": 523544480.0}}, 
"mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 24510808064.0, "5": 24510808064.0, "10": 24510808064.0, "15": 24510808064.0, "20": 24510808064.0, "25": 24510808064.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 52700401664.0, "5": 60489064448.0, "10": 60489064448.0, "15": 60489064448.0, "20": 60489064448.0, "25": 60489064448.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.84236, "15": "nan", "20": 2.8477, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.58569, "5": 12.5828, "10": 12.48258, "15": 11.79645, "20": 11.47664, "25": 10.97988}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 521035392.0, "5": 520993472.0, "10": 521176928.0, "15": 521588800.0, "20": 521133408.0, "25": 523547232.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 24540168192.0, "5": 24540168192.0, "10": 24540168192.0, "15": 24540168192.0, "20": 24540168192.0, "25": 24540168192.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 52729761792.0, "5": 60518424576.0, "10": 60518424576.0, "15": 60518424576.0, "20": 60518424576.0, "25": 60518424576.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 1.26794, "15": "nan", "20": 1.25096, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.61228, "5": 12.60403, "10": 12.49844, "15": 11.8178, "20": 11.50309, "25": 10.99207}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 523041344.0, "5": 523013024.0, "10": 523188736.0, "15": 523626720.0, "20": 523224480.0, "25": 525635552.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 20634324992.0, "5": 20634324992.0, "10": 20634324992.0, "15": 20634324992.0, "20": 20634324992.0, "25": 20634324992.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 51333926912.0, "5": 58188337152.0, "10": 58188337152.0, "15": 58188337152.0, "20": 58188337152.0, "25": 58188337152.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.59405, "15": "nan", "20": 2.60299, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.59715, "5": 12.59006, "10": 12.49071, "15": 11.82094, "20": 11.51707, "25": 11.00352}}, 
"num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 523037536.0, "5": 523010848.0, "10": 523184768.0, "15": 523629344.0, "20": 523228704.0, "25": 525639232.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 20663961600.0, "5": 20663961600.0, "10": 20663961600.0, "15": 20663961600.0, "20": 20663961600.0, "25": 20663961600.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 50289545216.0, "5": 57144233984.0, "10": 57144233984.0, "15": 57144233984.0, "20": 57144233984.0, "25": 57144233984.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 1.13086, "15": "nan", "20": 1.13253, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82975, "5": 10.8439, "10": 10.79337, "15": 10.77994, "20": 10.67712, "25": 10.48584, "30": 10.28468, "35": 10.18859, "40": 9.99279, "45": 9.72153, "50": 9.82127}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 226.0, "5": 275.0, "10": 181.0, "15": 253.0, "20": 248.0, "25": 207.0, "30": 265.0, "35": 281.0, "40": 315.0, "45": 282.0, "50": 336.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 831212544.0, "5": 831212544.0, "10": 831212544.0, "15": 831212544.0, "20": 831212544.0, "25": 831212544.0, "30": 831212544.0, "35": 831212544.0, "40": 831212544.0, "45": 831212544.0, "50": 831212544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 891582464.0, "5": 1250786304.0, "10": 1250786304.0, "15": 1250786304.0, "20": 1250786304.0, "25": 1250786304.0, "30": 1250786304.0, "35": 1251833856.0, "40": 1251833856.0, "45": 1251833856.0, "50": 1251833856.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 20.1181, "5": 0.47795, "10": 0.47291, "15": 0.48167, "20": 0.412, "25": 0.41115, "30": 0.41145, "35": 0.41136, "40": 0.41095, "45": 0.40816, "50": 0.42667}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85004, "5": 10.86413, "10": 10.82533, "15": 10.81501, "20": 10.72113, "25": 10.53088, "30": 10.33843, "35": 10.24208, "40": 10.05219, "45": 9.76638, "50": 9.85497}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1683.0, "5": 1927.0, "10": 1648.0, "15": 2007.0, "20": 1833.0, "25": 1805.0, "30": 2032.0, "35": 2136.0, "40": 2234.0, "45": 2271.0, "50": 2398.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, 
"step_interval": 5, "values": {"1": 886001664.0, "5": 886001664.0, "10": 886001664.0, "15": 886001664.0, "20": 886001664.0, "25": 886001664.0, "30": 886001664.0, "35": 886001664.0, "40": 886001664.0, "45": 886001664.0, "50": 886001664.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212633088.0, "5": 3570001920.0, "10": 3570001920.0, "15": 3570001920.0, "20": 3570001920.0, "25": 3570001920.0, "30": 3570001920.0, "35": 3570001920.0, "40": 3570001920.0, "45": 3570001920.0, "50": 3570001920.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.76404, "5": 0.14426, "10": 0.14503, "15": 0.14512, "20": 0.14395, "25": 0.14807, "30": 0.14833, "35": 0.1429, "40": 0.14205, "45": 0.14208, "50": 0.14172}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8468, "5": 10.8657, "10": 10.82411, "15": 10.8128, "20": 10.72008, "25": 10.53151, "30": 10.33655, "35": 10.24133, "40": 10.05096, "45": 9.76804, "50": 9.85531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1707.0, "5": 2121.0, "10": 1606.0, "15": 1959.0, "20": 1756.0, "25": 1848.0, "30": 2091.0, "35": 2089.0, "40": 2156.0, "45": 2137.0, "50": 2317.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212632576.0, "5": 3572098560.0, "10": 3572098560.0, "15": 3572098560.0, "20": 3572098560.0, "25": 3572098560.0, "30": 3572098560.0, "35": 3572098560.0, "40": 3572098560.0, "45": 3572098560.0, "50": 3572098560.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.77598, "5": 0.14261, "10": 0.14233, "15": 0.14134, "20": 0.14113, "25": 0.141, "30": 0.1403, "35": 0.1406, "40": 0.1401, "45": 0.13985, "50": 0.14004}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79436, "5": 10.84798, "10": 10.7703, "15": 10.78948, "20": 10.68039, "25": 10.506, "30": 10.33228, "35": 10.2547, "40": 10.05593, "45": 9.80637, "50": 9.89113}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1637.0, "5": 1785.0, "10": 1384.0, "15": 1933.0, "20": 1624.0, "25": 1589.0, "30": 1959.0, "35": 1973.0, "40": 2248.0, "45": 2173.0, "50": 2448.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 718931456.0, "5": 718931456.0, "10": 718931456.0, "15": 718931456.0, "20": 718931456.0, "25": 718931456.0, "30": 718931456.0, "35": 718931456.0, "40": 718931456.0, "45": 718931456.0, "50": 718931456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 
50, "step_interval": 5, "values": {"1": 2399714816.0, "5": 2685510144.0, "10": 2685510144.0, "15": 2685510144.0, "20": 2685510144.0, "25": 2685510144.0, "30": 2685510144.0, "35": 2685510144.0, "40": 2685510144.0, "45": 2685510144.0, "50": 2685510144.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3.76573, "5": 0.16293, "10": 0.16166, "15": 0.1618, "20": 0.16139, "25": 0.16605, "30": 0.162, "35": 0.16243, "40": 0.16141, "45": 0.16279, "50": 0.16404}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.73394, "5": 10.79243, "10": 10.70607, "15": 10.76012, "20": 10.68686, "25": 10.54768, "30": 10.45359, "35": 10.38572, "40": 10.24216, "45": 9.98159, "50": 10.06417}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2514.0, "5": 2818.0, "10": 2519.0, "15": 2543.0, "20": 2560.0, "25": 2574.0, "30": 2629.0, "35": 2568.0, "40": 2561.0, "45": 2508.0, "50": 2619.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 717420032.0, "5": 717420032.0, "10": 717420032.0, "15": 717420032.0, "20": 717420032.0, "25": 717420032.0, "30": 717420032.0, "35": 717420032.0, "40": 717420032.0, "45": 717420032.0, "50": 717420032.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2401419776.0, "5": 2684785152.0, "10": 2684785152.0, "15": 2684785152.0, "20": 2684785152.0, "25": 2684785152.0, "30": 2684785152.0, "35": 2684785152.0, "40": 2684785152.0, "45": 2684785152.0, "50": 2684785152.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.92787, "5": 0.17447, "10": 0.17372, "15": 0.17578, "20": 0.17588, "25": 0.17513, "30": 0.1731, "35": 0.1734, "40": 0.17385, "45": 0.17319, "50": 0.17333}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.74049, "5": 10.79201, "10": 10.71088, "15": 10.76031, "20": 10.6891, "25": 10.54338, "30": 10.4542, "35": 10.38324, "40": 10.24296, "45": 9.9834, "50": 10.06865}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2527.0, "5": 2875.0, "10": 2475.0, "15": 2508.0, "20": 2650.0, "25": 2392.0, "30": 2484.0, "35": 2573.0, "40": 2559.0, "45": 2519.0, "50": 2500.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 715322368.0, "5": 715322368.0, "10": 715322368.0, "15": 715322368.0, "20": 715322368.0, "25": 715322368.0, "30": 715322368.0, "35": 715322368.0, "40": 715322368.0, "45": 715322368.0, "50": 715322368.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2402991104.0, "5": 2683341824.0, "10": 2683341824.0, "15": 2683341824.0, "20": 2683341824.0, "25": 2683341824.0, "30": 2683341824.0, "35": 2683341824.0, "40": 2683341824.0, "45": 2683341824.0, "50": 2683341824.0}}, "iteration-time": {"start_step": 1, "end_step": 50, 
"step_interval": 5, "values": {"1": 14.81379, "5": 0.17159, "10": 0.17073, "15": 0.16785, "20": 0.17251, "25": 0.17348, "30": 0.17312, "35": 0.17159, "40": 0.16987, "45": 0.17054, "50": 0.16978}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.85169, "10": 10.78745, "15": 10.79503, "20": 10.69101, "25": 10.52199, "30": 10.34557, "35": 10.25813, "40": 10.06995, "45": 9.80182, "50": 9.8759}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1939.0, "10": 1348.0, "15": 1913.0, "20": 1684.0, "25": 1625.0, "30": 1929.0, "35": 1956.0, "40": 2108.0, "45": 2034.0, "50": 2458.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895616.0, "5": 4120607232.0, "10": 4120607232.0, "15": 4120607232.0, "20": 4120607232.0, "25": 4120607232.0, "30": 4120607232.0, "35": 4120607232.0, "40": 4120607232.0, "45": 4120607232.0, "50": 4120607232.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92138, "5": 0.1642, "10": 0.16403, "15": 0.16127, "20": 0.16115, "25": 0.16151, "30": 0.16082, "35": 0.16141, "40": 0.1612, "45": 0.16203, "50": 0.16105}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85285, "10": 10.78449, "15": 10.79226, "20": 10.69196, "25": 10.52317, "30": 10.34507, "35": 10.25889, "40": 10.07027, "45": 9.80301, "50": 9.87673}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1915.0, "10": 1361.0, "15": 1831.0, "20": 1695.0, "25": 1596.0, "30": 1821.0, "35": 1872.0, "40": 2121.0, "45": 2090.0, "50": 2395.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895104.0, "5": 4122703872.0, "10": 4122703872.0, "15": 4122703872.0, "20": 4122703872.0, "25": 4122703872.0, "30": 4122703872.0, "35": 4122703872.0, "40": 4122703872.0, "45": 4122703872.0, "50": 4122703872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.37934, "5": 0.166, "10": 0.16217, "15": 0.1635, "20": 0.16167, "25": 0.15901, "30": 0.15975, "35": 0.15935, "40": 0.15876, "45": 0.16028, "50": 0.15898}}} -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.8517, "10": 10.78749, "15": 10.79505, "20": 10.69119, "25": 10.52294, "30": 10.34604, "35": 10.26165, "40": 10.072, "45": 9.80976, "50": 9.88336}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1915.0, "10": 1391.0, "15": 1873.0, "20": 1698.0, "25": 1701.0, "30": 1980.0, "35": 1893.0, "40": 2037.0, "45": 1968.0, "50": 2391.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895616.0, "5": 4120607232.0, "10": 4120607232.0, "15": 4120607232.0, "20": 4120607232.0, "25": 4120607232.0, "30": 4120607232.0, "35": 4120607232.0, "40": 4120607232.0, "45": 4120607232.0, "50": 4120607232.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.13825, "5": 0.15932, "10": 0.16236, "15": 0.16058, "20": 0.15952, "25": 0.15943, "30": 0.15981, "35": 0.15842, "40": 0.16085, "45": 0.16001, "50": 0.15982}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84523, "5": 10.87431, "10": 10.82854, "15": 10.8192, "20": 10.72736, "25": 10.55176, "30": 10.3649, "35": 10.27828, "40": 10.09756, "45": 9.84183, "50": 9.91243}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1725.0, "5": 1906.0, "10": 1451.0, "15": 1899.0, "20": 1576.0, "25": 1534.0, "30": 1886.0, "35": 1905.0, "40": 2136.0, "45": 2154.0, "50": 2246.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3868255744.0, "5": 4152064512.0, "10": 4152064512.0, "15": 4152064512.0, "20": 4152064512.0, "25": 4152064512.0, "30": 4152064512.0, "35": 4152064512.0, "40": 4152064512.0, "45": 4152064512.0, "50": 4152064512.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.86411, "5": 0.13803, "10": 0.13439, "15": 0.1352, "20": 0.14116, "25": 0.13406, "30": 0.13892, "35": 0.13943, "40": 0.14209, "45": 0.14014, "50": 0.14122}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 
10.78455, "15": 10.79229, "20": 10.69211, "25": 10.52412, "30": 10.34552, "35": 10.26239, "40": 10.07241, "45": 9.81101, "50": 9.88422}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1848.0, "20": 1601.0, "25": 1635.0, "30": 1936.0, "35": 1908.0, "40": 2100.0, "45": 2098.0, "50": 2333.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 732811264.0, "5": 732811264.0, "10": 732811264.0, "15": 732811264.0, "20": 732811264.0, "25": 732811264.0, "30": 732811264.0, "35": 732811264.0, "40": 732811264.0, "45": 732811264.0, "50": 732811264.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895104.0, "5": 4122703872.0, "10": 4122703872.0, "15": 4122703872.0, "20": 4122703872.0, "25": 4122703872.0, "30": 4122703872.0, "35": 4122703872.0, "40": 4122703872.0, "45": 4122703872.0, "50": 4122703872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.9121, "5": 0.1731, "10": 0.17256, "15": 0.1722, "20": 0.17555, "25": 0.17245, "30": 0.17067, "35": 0.17091, "40": 0.17274, "45": 0.17151, "50": 0.17108}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.88761, "5": 10.9019, "10": 10.86847, "15": 10.84822, "20": 10.71762, "25": 10.54247, "30": 10.33628, "35": 10.23953, "40": 10.03243, "45": 9.768, "50": 9.8531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 581.0, "5": 672.0, "10": 570.0, "15": 660.0, "20": 642.0, "25": 631.0, "30": 634.0, "35": 765.0, "40": 832.0, "45": 798.0, "50": 829.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 609140224.0, "5": 609140224.0, "10": 609140224.0, "15": 609140224.0, "20": 609140224.0, "25": 609140224.0, "30": 609140224.0, "35": 609140224.0, "40": 609140224.0, "45": 609140224.0, "50": 609140224.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 881272320.0, "5": 1139960320.0, "10": 1139960320.0, "15": 1139960320.0, "20": 1139960320.0, "25": 1139960320.0, "30": 1139960320.0, "35": 1139960320.0, "40": 1139960320.0, "45": 1139960320.0, "50": 1139960320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.59675, "5": 0.37244, "10": 0.37323, "15": 0.37434, "20": 0.37587, "25": 0.37155, "30": 0.36463, "35": 0.361, "40": 0.36207, "45": 0.36168, "50": 0.35807}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.88759, "5": 10.90189, "10": 10.86849, "15": 10.84829, "20": 10.71772, "25": 10.54269, "30": 10.33645, "35": 10.23973, "40": 10.03266, "45": 9.76817, "50": 9.85325}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 584.0, "5": 690.0, "10": 501.0, "15": 618.0, "20": 573.0, "25": 605.0, "30": 
678.0, "35": 702.0, "40": 775.0, "45": 787.0, "50": 830.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 611892224.0, "5": 611892224.0, "10": 611892224.0, "15": 611892224.0, "20": 611892224.0, "25": 611892224.0, "30": 611892224.0, "35": 611892224.0, "40": 611892224.0, "45": 611892224.0, "50": 611892224.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 879199232.0, "5": 1155304960.0, "10": 1155318784.0, "15": 1155318784.0, "20": 1155318784.0, "25": 1155318784.0, "30": 1155318784.0, "35": 1155318784.0, "40": 1155318784.0, "45": 1155318784.0, "50": 1155318784.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.16274, "5": 0.34171, "10": 0.34862, "15": 0.36414, "20": 0.34819, "25": 0.33727, "30": 0.35247, "35": 0.35263, "40": 0.33785, "45": 0.3406, "50": 0.35113}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 675226112.0, "5": 857134592.0, "10": 857134592.0, "15": 857134592.0, "20": 857134592.0, "25": 857134592.0, "30": 857134592.0, "35": 857134592.0, "40": 857134592.0, "45": 857134592.0, "50": 857134592.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.29671, "5": 0.41356, "10": 0.41276, "15": 0.4124, "20": 0.41115, "25": 0.41244, "30": 0.41458, "35": 0.41419, "40": 0.41405, "45": 0.41469, "50": 0.41348}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, 
"40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759897600.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.93298, "5": 0.35524, "10": 0.35279, "15": 0.3474, "20": 0.35066, "25": 0.35914, "30": 0.35208, "35": 0.35087, "40": 0.35252, "45": 0.35496, "50": 0.36129}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274688.0, "5": 857131520.0, "10": 857134592.0, "15": 857134592.0, "20": 857134592.0, "25": 857134592.0, "30": 857134592.0, "35": 857134592.0, "40": 857134592.0, "45": 857134592.0, "50": 857134592.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.08022, "5": 0.41819, "10": 0.41975, "15": 0.42276, "20": 0.41504, "25": 0.41104, "30": 0.41458, "35": 0.41187, "40": 0.41442, "45": 0.41888, "50": 0.41596}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898112.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 
933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.82747, "5": 0.35095, "10": 0.35221, "15": 0.35252, "20": 0.35092, "25": 0.35493, "30": 0.35627, "35": 0.35299, "40": 0.35323, "45": 0.35997, "50": 0.34428}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274176.0, "5": 858183168.0, "10": 858183168.0, "15": 858183168.0, "20": 858183168.0, "25": 858183168.0, "30": 858183168.0, "35": 858183168.0, "40": 858183168.0, "45": 858183168.0, "50": 858183168.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.2965, "5": 0.44042, "10": 0.42384, "15": 0.4236, "20": 0.41899, "25": 0.42049, "30": 0.42347, "35": 0.42232, "40": 0.42267, "45": 0.42233, "50": 0.42382}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 17.78221, "5": 0.35745, "10": 0.35686, "15": 0.35523, "20": 0.34955, "25": 0.34923, "30": 0.35955, "35": 0.36112, "40": 0.34611, "45": 0.40112, "50": 0.34706}}} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87857, "10": 10.8298, "15": 10.82043, "20": 10.7038, "25": 10.49396, "30": 10.30535, "35": 10.20165, "40": 10.01884, "45": 9.74947, "50": 9.83976}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 632.0, "5": 590.0, "10": 548.0, "15": 633.0, "20": 581.0, "25": 568.0, "30": 662.0, "35": 713.0, "40": 768.0, "45": 808.0, "50": 814.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.52949, "5": 0.34536, "10": 0.34032, "15": 0.34188, "20": 0.34737, "25": 0.35154, "30": 0.34407, "35": 0.35764, "40": 0.34414, "45": 0.34298, "50": 0.34026}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.92969, "10": 10.9047, "15": 10.87118, "20": 10.74988, "25": 10.53733, "30": 10.32529, "35": 10.22869, "40": 10.01949, "45": 9.75528, "50": 9.84055}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 603.0, "5": 651.0, "10": 512.0, "15": 655.0, "20": 611.0, "25": 608.0, "30": 647.0, "35": 716.0, "40": 794.0, "45": 842.0, "50": 775.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 431522304.0, "5": 431522304.0, "10": 431522304.0, "15": 431522304.0, "20": 431522304.0, "25": 431522304.0, "30": 431522304.0, "35": 431522304.0, "40": 431522304.0, "45": 431522304.0, "50": 431522304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274688.0, "5": 861328384.0, "10": 861328896.0, "15": 861328896.0, "20": 861328896.0, "25": 861328896.0, "30": 861328896.0, "35": 861328896.0, "40": 861328896.0, "45": 861328896.0, "50": 861328896.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.53857, "5": 0.41994, "10": 0.42437, "15": 0.4243, "20": 0.41968, "25": 0.41901, "30": 0.41523, "35": 0.41823, "40": 0.4121, "45": 0.4234, "50": 0.41265}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87858, "10": 10.82978, "15": 10.82045, "20": 10.70382, "25": 10.49393, "30": 10.30533, "35": 10.20167, "40": 10.01882, "45": 9.74952, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 615.0, "5": 655.0, "10": 536.0, "15": 663.0, "20": 604.0, "25": 625.0, "30": 742.0, "35": 711.0, "40": 744.0, "45": 840.0, "50": 883.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 934204416.0, "35": 934204416.0, "40": 934204416.0, "45": 934204416.0, "50": 934204416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.22018, "5": 0.35813, "10": 0.36158, "15": 0.35614, "20": 0.35813, "25": 0.35947, "30": 0.35907, "35": 0.35505, "40": 0.35725, "45": 0.35408, "50": 0.35552}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92965, "10": 10.90473, "15": 10.87127, "20": 10.74997, "25": 10.53754, "30": 10.32548, "35": 10.22895, "40": 10.01975, "45": 9.75546, "50": 9.84069}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 585.0, "5": 675.0, "10": 544.0, "15": 619.0, "20": 579.0, "25": 620.0, "30": 678.0, "35": 717.0, "40": 813.0, "45": 746.0, "50": 841.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 432177152.0, "5": 432177152.0, "10": 432177152.0, "15": 432177152.0, "20": 432177152.0, "25": 432177152.0, "30": 432177152.0, "35": 432177152.0, "40": 432177152.0, "45": 432177152.0, "50": 432177152.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676283904.0, "5": 856228864.0, "10": 857276928.0, "15": 857276928.0, "20": 857276928.0, "25": 857276928.0, "30": 857276928.0, "35": 857276928.0, "40": 857276928.0, "45": 857276928.0, "50": 857276928.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.34002, "5": 0.40276, "10": 0.39665, "15": 0.39344, "20": 0.39157, "25": 0.3871, "30": 0.38802, "35": 0.39196, "40": 0.38964, "45": 0.39313, "50": 0.39241}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92655, "5": 10.92722, "10": 10.9079, "15": 10.88296, "20": 10.77594, "25": 10.59266, "30": 10.39175, "35": 10.29701, "40": 10.09666, "45": 9.8447, "50": 9.90944}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": 
{"1": 1675.0, "5": 2035.0, "10": 1469.0, "15": 1853.0, "20": 1641.0, "25": 1685.0, "30": 1947.0, "35": 1941.0, "40": 2148.0, "45": 2122.0, "50": 2483.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 435191808.0, "5": 435191808.0, "10": 435191808.0, "15": 435191808.0, "20": 435191808.0, "25": 435191808.0, "30": 435191808.0, "35": 435191808.0, "40": 435191808.0, "45": 435191808.0, "50": 435191808.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.35385, "5": 0.17431, "10": 0.16906, "15": 0.16815, "20": 0.17162, "25": 0.17427, "30": 0.16998, "35": 0.172, "40": 0.17758, "45": 0.16824, "50": 0.16924}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92335, "5": 10.92815, "10": 10.9082, "15": 10.8847, "20": 10.77516, "25": 10.59065, "30": 10.39293, "35": 10.29701, "40": 10.09481, "45": 9.84578, "50": 9.90863}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 66.0, "5": 59.0, "10": 57.0, "15": 61.0, "20": 67.0, "25": 62.0, "30": 57.0, "35": 64.0, "40": 68.0, "45": 77.0, "50": 86.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 488145408.0, "5": 488145408.0, "10": 488145408.0, "15": 488145408.0, "20": 488145408.0, "25": 488145408.0, "30": 488145408.0, "35": 488145408.0, "40": 488145408.0, "45": 488145408.0, "50": 488145408.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2158389760.0, "5": 2340560384.0, "10": 2340560384.0, "15": 2340560384.0, "20": 2340560384.0, "25": 2340560384.0, "30": 2340560384.0, "35": 2340560384.0, "40": 2340560384.0, "45": 2340560384.0, "50": 2340560384.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.76555, "5": 0.21558, "10": 0.21605, "15": 0.21624, "20": 0.21751, "25": 0.21773, "30": 0.21692, "35": 0.21584, "40": 0.21624, "45": 0.21663, "50": 0.2151}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92799, "10": 10.90789, "15": 10.88313, "20": 10.77626, "25": 10.59138, "30": 10.39195, "35": 10.29687, "40": 10.0964, "45": 9.84466, "50": 9.90919}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 68.0, "5": 64.0, "10": 61.0, "15": 58.0, "20": 64.0, "25": 58.0, "30": 85.0, "35": 66.0, "40": 85.0, "45": 82.0, "50": 68.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2158389248.0, "5": 2338462720.0, "10": 2338462720.0, "15": 2338462720.0, 
"20": 2338462720.0, "25": 2338462720.0, "30": 2338462720.0, "35": 2338462720.0, "40": 2338462720.0, "45": 2338462720.0, "50": 2338462720.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.46397, "5": 0.23056, "10": 0.22926, "15": 0.22949, "20": 0.23168, "25": 0.23436, "30": 0.2291, "35": 0.22829, "40": 0.22791, "45": 0.22801, "50": 0.20614}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92335, "5": 10.92817, "10": 10.90824, "15": 10.88471, "20": 10.77514, "25": 10.5907, "30": 10.39289, "35": 10.29701, "40": 10.09486, "45": 9.84576, "50": 9.90869}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1834.0, "5": 2149.0, "10": 1616.0, "15": 2030.0, "20": 1833.0, "25": 1728.0, "30": 2045.0, "35": 2164.0, "40": 2364.0, "45": 2242.0, "50": 2670.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1720084992.0, "5": 1900158464.0, "10": 1900158464.0, "15": 1900158464.0, "20": 1900158464.0, "25": 1900158464.0, "30": 1900158464.0, "35": 1900158464.0, "40": 1900158464.0, "45": 1900158464.0, "50": 1900158464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.80378, "5": 0.21906, "10": 0.21975, "15": 0.21443, "20": 0.21546, "25": 0.21553, "30": 0.21592, "35": 0.21551, "40": 0.21537, "45": 0.21378, "50": 0.21373}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92795, "10": 10.90786, "15": 10.88314, "20": 10.77629, "25": 10.59141, "30": 10.39192, "35": 10.29686, "40": 10.0964, "45": 9.84464, "50": 9.90918}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1627.0, "5": 2010.0, "10": 1368.0, "15": 1897.0, "20": 1626.0, "25": 1743.0, "30": 1930.0, "35": 1954.0, "40": 2199.0, "45": 2068.0, "50": 2460.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 488144896.0, "5": 488144896.0, "10": 488144896.0, "15": 488144896.0, "20": 488144896.0, "25": 488144896.0, "30": 488144896.0, "35": 488144896.0, "40": 488144896.0, "45": 488144896.0, "50": 488144896.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1902255104.0, "10": 1902255104.0, "15": 1902255104.0, "20": 1902255104.0, "25": 1902255104.0, "30": 1902255104.0, "35": 1902255104.0, "40": 1902255104.0, "45": 1902255104.0, "50": 1902255104.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.72897, "5": 0.21376, "10": 0.21471, "15": 0.21644, "20": 0.21662, "25": 0.21524, "30": 
0.21202, "35": 0.21278, "40": 0.21187, "45": 0.21266, "50": 0.21239}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81294, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28766}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378378752.0, "5": 378378752.0, "10": 378903040.0, "15": 378903040.0, "20": 560548864.0, "25": 560811008.0, "30": 559500288.0, "35": 560548864.0, "40": 560548864.0, "45": 560548864.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2086473728.0, "25": 2086473728.0, "30": 2086473728.0, "35": 2086473728.0, "40": 2086473728.0, "45": 2086473728.0, "50": 2086473728.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.42624, "5": 0.19771, "10": 0.19787, "15": 0.19869, "20": 0.2134, "25": 0.21429, "30": 0.21327, "35": 0.21407, "40": 0.21288, "45": 0.21339, "50": 0.21186}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2304.0, "30": 2419.0, "35": 1906.0, "40": 2063.0, "45": 2340.0, "50": 2943.0}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81295, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28766}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378903040.0, "5": 378378752.0, "10": 378903040.0, "15": 378378752.0, "20": 560811008.0, "25": 560548864.0, "30": 561073152.0, "35": 562646016.0, "40": 560548864.0, "45": 562646016.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2087784448.0, "25": 2087784448.0, "30": 2087784448.0, "35": 2087784448.0, "40": 2087784448.0, "45": 2087784448.0, "50": 2087784448.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.5872, "5": 0.20393, "10": 0.20412, "15": 0.20193, "20": 0.22109, "25": 0.21826, "30": 0.21476, "35": 0.21348, "40": 0.21255, "45": 0.21142, "50": 0.21064}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2491.0, "30": 2428.0, "35": 1827.0, "40": 2072.0, "45": 2361.0, "50": 2998.0}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json: 
-------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.8517, "10": 10.78749, "15": 10.79506, "20": 10.69119, "25": 10.52293, "30": 10.34604, "35": 10.26168, "40": 10.07199, "45": 9.8098, "50": 9.88336}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1915.0, "10": 1391.0, "15": 1773.0, "20": 1615.0, "25": 1748.0, "30": 1877.0, "35": 1915.0, "40": 2111.0, "45": 2009.0, "50": 2347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 522846720.0, "5": 522846720.0, "10": 522846720.0, "15": 522846720.0, "20": 522846720.0, "25": 522846720.0, "30": 522846720.0, "35": 522846720.0, "40": 522846720.0, "45": 522846720.0, "50": 522846720.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768846848.0, "5": 3912608256.0, "10": 3912608256.0, "15": 3912608256.0, "20": 3912608256.0, "25": 3912608256.0, "30": 3912608256.0, "35": 3912608256.0, "40": 3912608256.0, "45": 3912608256.0, "50": 3912608256.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.36782, "5": 0.18832, "10": 0.16735, "15": 0.16595, "20": 0.16466, "25": 0.16564, "30": 
0.16594, "35": 0.16362, "40": 0.16524, "45": 0.16382, "50": 0.16329}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp1_ep8_etp1_cp_memory_speed/golden_values_dev_dgx_h100.json: 
--------------------------------------------------------------------------------
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.30214, "5": 12.30024, "10": 12.30151, "15": 12.30332, "20": 12.30211, "25": 12.30186}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 328814144.0, "5": 328839744.0, "10": 328755232.0, "15": 328750112.0, "20": 328814144.0, "25": 328757856.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 43403747328.0, "5": 43403747328.0, "10": 43403747328.0, "15": 43403747328.0, "20": 43403747328.0, "25": 43403747328.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 50198523904.0, "5": 60830679040.0, "10": 60851249152.0, "15": 60851249152.0, "20": 60851265536.0, "25": 60851265536.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 57.07556, "5": 4.53974, "10": 4.46565, "15": 4.45791, "20": 4.45123, "25": 4.44671}}}
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json:
--------------------------------------------------------------------------------
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 10.74903, "5": 11.07428, "10": 9.25131, "15": 8.7913, "20": 8.16509, "25": 7.78974}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 245867.0, "5": 251639.0, "10": 252463.0, "15": 262053.0, "20": 248279.0, "25": 237341.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40574230528.0, "5": 40574230528.0, "10": 40574230528.0, "15": 40574230528.0, "20": 40574230528.0, "25": 40574230528.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40574234624.0, "5": 44891381760.0, "10": 44900294656.0, "15": 44902916096.0, "20": 44902916096.0, "25": 44902916096.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 13.83435, "5": 0.45747, "10": 0.45413, "15": 0.45505, "20": 0.45544, "25": 0.45536}}}
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-dev.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-pyt-dev
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-lts.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-pyt-lts
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_lts
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-nemo.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-nemo
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci
--------------------------------------------------------------------------------
/tests/unit_tests/__init__.py:
--------------------------------------------------------------------------------
import torch._dynamo

torch._dynamo.config.suppress_errors = True
--------------------------------------------------------------------------------
/tests/unit_tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/data/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/conftest.py:
--------------------------------------------------------------------------------
from unittest import mock

import pytest

from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy
from megatron.core.msc_utils import MultiStorageClientFeature


def pytest_sessionfinish(session, exitstatus):
    if exitstatus == 5:
        session.exitstatus = 0


@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
    return tmp_path_factory.mktemp("data")


@pytest.fixture(scope='session', autouse=True)
def set_default_dist_ckpt_strategy():
    # Disable MSC for tests
    MultiStorageClientFeature.disable()

    def get_pyt_dist_save_sharded_strategy():
        return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1)

    with mock.patch(
        'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy',
        new=get_pyt_dist_save_sharded_strategy,
    ) as _fixture:
        yield _fixture
--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/dist_checkpointing/models/__init__.py
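The `mock.patch(..., new=...)` idiom in the dist-checkpointing conftest above substitutes a concrete function for the patched attribute instead of the default `MagicMock`, which is how every test in the session is silently forced onto the `torch_dist` save strategy. A self-contained toy (not Megatron code) showing the same mechanism:

```python
from unittest import mock


def default_save_strategy():
    return "fully_parallel"


def forced_strategy():
    return "torch_dist"


if __name__ == "__main__":
    # Patching by dotted name replaces the module attribute for the duration
    # of the context, and `new=` supplies the exact substitute object.
    with mock.patch(f"{__name__}.default_save_strategy", new=forced_strategy):
        assert default_save_strategy() == "torch_dist"
    assert default_save_strategy() == "fully_parallel"  # restored on exit
```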
--------------------------------------------------------------------------------
/tests/unit_tests/export/trtllm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/export/trtllm/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/engines/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/model_inference_wrappers/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)


class TestModelInferenceWrapperConfig:

    def test_inference_config(self):
        inference_config = InferenceWrapperConfig(
            hidden_size=10,
            inference_batch_times_seqlen_threshold=10,
            padded_vocab_size=10,
            params_dtype=torch.float,
            fp32_residual_connection=False,
        )
        inference_config.add_attributes({"abc": 45})
        # The message must reference the attribute actually under test; the
        # original referenced a nonexistent `min_tokens` attribute, which would
        # raise AttributeError if the assertion ever fired.
        assert (
            inference_config.abc == 45
        ), f"attribute not set correctly. it is {inference_config.abc}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_common_inference_params.py:
--------------------------------------------------------------------------------
from megatron.core.inference.sampling_params import SamplingParams


class TestSamplingParams:

    def test_sampling_params(self):
        sampling_params = SamplingParams()
        sampling_params.add_attributes({"min_tokens": 45})
        assert (
            sampling_params.min_tokens == 45
        ), f"min tokens not set correctly. it is {sampling_params.min_tokens}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_flash_decode.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb_with_cos_sin
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding


class TestRotaryEmbeddingWithPrecomputedCosSin:

    def setup_method(self):
        self.batch_size = 3
        self.seq_len = 4
        self.d_rot = 6
        self.rotary_embedding = RotaryEmbedding(kv_channels=4, rotary_percent=1.0)

    def test_output_shapes_match(self):

        # Create input tensors
        t = torch.randn(self.seq_len, self.batch_size, 2, self.d_rot * 2, device="cuda")
        rotary_pos_cos, rotary_pos_sin = self.rotary_embedding.get_cos_sin(self.seq_len)

        # Test using Flash Decoding optimized kernel which requires precomputed cos & sin tensors
        expected_shape = torch.Size(
            [self.seq_len, self.batch_size, self.seq_len // 2, self.seq_len * self.batch_size]
        )
        output_flash_rotary = apply_rotary_pos_emb_with_cos_sin(
            t, rotary_pos_cos, rotary_pos_sin, rotary_interleaved=True
        )

        assert (
            output_flash_rotary.shape == expected_shape
        ), f"Outputs do not match: {output_flash_rotary.shape} != {expected_shape}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_inference_utils.py:
--------------------------------------------------------------------------------
from megatron.core.inference.utils import Counter


class TestInferenceUtils:

    def test_counter(self):
        counter = Counter()
        r = next(counter)
        assert r == 0, f'Counter return value should be 0 but it is {r}'
        assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}'
        counter.reset()
        assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}'
--------------------------------------------------------------------------------
/tests/unit_tests/inference/text_generation_controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/text_generation_controllers/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/models/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/pipeline_parallel/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/post_training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/post_training/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/tensor_parallel/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_cross_entropy.py:
--------------------------------------------------------------------------------
import numpy as np
import torch

from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from tests.unit_tests.test_utilities import Utils


def test_vocab_parallel_cross_entropy():
    Utils.initialize_model_parallel(4, 2)
    # torch.range is deprecated; arange with an explicit float dtype yields the
    # same 8-element tensor [0.0, ..., 7.0].
    vocab_parallel_logits = torch.arange(0, 8, dtype=torch.float).repeat(16, 4).cuda()
    target = torch.arange(0, 32, 2).cuda()
    output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
    expected_output = torch.tensor(
        [
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
        ]
    ).cuda()
    assert torch.equal(torch.round(expected_output), torch.round(output))
    Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_data.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.tensor_parallel.data import broadcast_data
from tests.unit_tests.test_utilities import Utils


def test_broadcast_data():
    Utils.initialize_model_parallel(2, 4)
    input_data = {
        0: torch.ones((8, 8)).cuda() * 0.0,
        1: torch.ones((8, 8)).cuda() * 1.0,
        2: torch.ones((8, 8)).cuda() * 2.0,
        3: torch.ones((8, 8)).cuda() * 3.0,
        4: torch.ones((8, 8)).cuda() * 4.0,
        5: torch.ones((8, 8)).cuda() * 5.0,
        6: torch.ones((8, 8)).cuda() * 6.0,
        7: torch.ones((8, 8)).cuda() * 7.0,
    }
    dtype = torch.float32
    actual_output = broadcast_data([0, 1], input_data, dtype)
    assert torch.equal(actual_output[0], input_data[0])
    assert torch.equal(actual_output[1], input_data[1])
    Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/test_basic.py:
--------------------------------------------------------------------------------
def test_import():
    import megatron
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/transformer/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/transformer/moe/__init__.py
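The tensor-parallel tests above call `Utils.initialize_model_parallel(4, 2)` and `(2, 4)`, so they assume a `torch.distributed` world of 8 GPU ranks rather than a plain single-process pytest run. A sketch of a possible launch command, assuming an 8-GPU node; the exact flags used by the CI may differ:

```bash
# Hypothetical invocation; requires 8 visible GPUs.
torchrun --nproc_per_node=8 -m pytest -x tests/unit_tests/tensor_parallel/
```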
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euox pipefail 3 | 4 | GIT_VERSION=$(git version | awk '{print $3}') 5 | GIT_MAJOR=$(echo $GIT_VERSION | awk -F. '{print $1}') 6 | GIT_MINOR=$(echo $GIT_VERSION | awk -F. '{print $2}') 7 | 8 | if [[ $GIT_MAJOR -eq 2 && $GIT_MINOR -lt 31 ]]; then 9 | echo "Git version must be at least 2.31.0. Found $GIT_VERSION" 10 | exit 1 11 | fi 12 | 13 | SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) 14 | CHECK_ONLY=${CHECK_ONLY:-false} 15 | SKIP_DOCS=${SKIP_DOCS:-false} 16 | 17 | BASE_REF=${BASE_REF:-main} 18 | CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) 19 | ADDITIONAL_ARGS="" 20 | ADDITIONAL_BLACK_ARGS="" 21 | ADDITIONAL_PYLINT_ARGS="" 22 | 23 | if [[ $CHECK_ONLY == true ]]; then 24 | ADDITIONAL_ARGS="--check" 25 | ADDITIONAL_BLACK_ARGS="--diff" 26 | fi 27 | 28 | if [[ $SKIP_DOCS == true ]]; then 29 | ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116" 30 | fi 31 | 32 | if [[ -n "$CHANGED_FILES" ]]; then 33 | black --skip-magic-trailing-comma $ADDITIONAL_ARGS $ADDITIONAL_BLACK_ARGS --verbose $CHANGED_FILES 34 | isort $ADDITIONAL_ARGS $CHANGED_FILES 35 | pylint $ADDITIONAL_PYLINT_ARGS $CHANGED_FILES 36 | mypy --explicit-package-bases --follow-imports=skip $CHANGED_FILES || true 37 | else 38 | echo Changeset is empty, all good. 39 | fi 40 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/copyright.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Files ending with .py should have Copyright notice in the first line. 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 5 | 6 | # Move to the project root 7 | cd $SCRIPT_DIR/.. 8 | find_files_with_missing_copyright() { 9 | find ./megatron/ -type f -name '*.py' | while read path; do 10 | echo -en $path"\t" 11 | head -2 $path | grep -iv 'coding=' | head -1 12 | done \ 13 | | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \ 14 | | grep -iv 'BSD 3-Clause License' \ 15 | | grep -iv 'Copyright.*Microsoft' \ 16 | | grep -iv 'Copyright.*The Open AI Team' \ 17 | | grep -iv 'Copyright.*The Google AI' \ 18 | | grep -iv 'Copyright.*Facebook' | while read line; do 19 | echo $line | cut -d' ' -f1 20 | done 21 | } 22 | 23 | 24 | declare RESULT=($(find_files_with_missing_copyright)) # (..) 
--------------------------------------------------------------------------------
/tools/copyright.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Python files must carry a copyright notice on the first line
# (an optional `coding=` line may precede it).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Move to the project root
cd $SCRIPT_DIR/..

find_files_with_missing_copyright() {
    find ./megatron/ -type f -name '*.py' | while read path; do
        echo -en $path"\t"
        head -2 "$path" | grep -iv 'coding=' | head -1
    done \
        | grep -Eiv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \
        | grep -iv 'BSD 3-Clause License' \
        | grep -iv 'Copyright.*Microsoft' \
        | grep -iv 'Copyright.*The Open AI Team' \
        | grep -iv 'Copyright.*The Google AI' \
        | grep -iv 'Copyright.*Facebook' | while read line; do
            echo $line | cut -d' ' -f1
        done
}


# $(..) inside (..) captures the function's output as a bash array.
declare RESULT=($(find_files_with_missing_copyright))

if [ "${#RESULT[@]}" -gt 0 ]; then
    echo "Error: Found files with missing copyright:"
    for (( i=0; i<"${#RESULT[@]}"; i++ )); do
        echo "path= ${RESULT[$i]}"
    done
    exit 1
else
    echo "Ok: All files start with copyright notice"
fi
--------------------------------------------------------------------------------
/tools/linter.py:
--------------------------------------------------------------------------------
import os
import os.path as osp
import pathlib
import subprocess


def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm"""

    try:
        import autopep8
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # get all python file paths from top level directory
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    all_py_paths = set(
        os.path.join(working_dir, fname)
        for fname in os.listdir(working_dir)
        if fname.endswith(".py")
    )

    # get all python file paths from chosen subdirectories
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(
                set(osp.join(path, fname) for fname in fnames if fname.endswith(".py"))
            )

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass the command as an argument list; a single string would be
        # treated as the program name and fail without shell=True.
        command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
--------------------------------------------------------------------------------
/tools/report_theoretical_memory.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

"""Computes theoretical memory footprint for model training without instantiating
a model and running training iterations on GPU(s)."""

from megatron.training import get_args
from megatron.training.initialize import initialize_megatron
from megatron.training.theoretical_memory_usage import report_theoretical_memory

if __name__ == "__main__":
    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    args = get_args()

    report_theoretical_memory(args, verbose=True)
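
report_theoretical_memory.py above drives Megatron's internal estimator. As an independent back-of-envelope cross-check, the classic mixed-precision Adam accounting (the ZeRO paper's figure, not Megatron's exact model, which also covers activations and parallel layouts) comes to 16 bytes per parameter:

# fp16 weights (2 B) + fp16 grads (2 B) + fp32 master weights (4 B)
# + fp32 Adam momentum (4 B) + fp32 Adam variance (4 B) = 16 B/param.
def rough_model_state_gib(num_params: float) -> float:
    return num_params * 16 / 2**30

print(f"{rough_model_state_gib(175e9):.0f} GiB")  # ~2608 GiB for a 175B-parameter model
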
--------------------------------------------------------------------------------
/tools/retro/cli/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

from .cli import retro
--------------------------------------------------------------------------------
/tools/retro/cli/__main__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import os

from . import retro


if __name__ == "__main__":
    retro.init(os.environ["RETRO_PROJECT_DIR"])
--------------------------------------------------------------------------------
/tools/retro/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM nvcr.io/nvidia/pytorch:23.09-py3

# Combine the apt steps and the pip installs to keep the layer count down.
RUN apt update && apt install -qy htop

RUN pip install -U faiss-gpu transformers google-api-python-client \
    && pip install sentencepiece h5py nltk einops
--------------------------------------------------------------------------------
/tools/retro/sft/README.md:
--------------------------------------------------------------------------------
## Note

The content within this `sft` directory is still under active development and will be updated soon.
--------------------------------------------------------------------------------
/tools/retro/sft/open_inst.sh:
--------------------------------------------------------------------------------
DATA_BLEND="1.0 open_inst"
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

# Usage: python tools/text_generation_cli.py <host:port>
# Interactive client for a running Megatron text generation server.
# (A scripted variant is sketched at the end of this document.)
import ast
import json
import sys

import requests

if __name__ == "__main__":
    url = 'http://' + sys.argv[1] + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        tokens_to_generate = ast.literal_eval(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
/tools/wait_daemon.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# SLURM by default terminates all user processes when the main job process is
# finished. This also immediately terminates inprocess.MonitorProcess and
# prevents it from submitting information to the distributed store and from
# finalizing the iteration by waiting on the termination barrier.
#
# This script waits for all "python" processes launched by the current user to
# finish before terminating the SLURM job.

is_daemon_running() {
    pgrep -u $USER "python" > /dev/null
}

# Intended to be invoked (or sourced and called) at the end of the job script.
wait_daemon() {
    while is_daemon_running; do
        sleep 1
    done
}
--------------------------------------------------------------------------------
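
For scripted runs against the same HTTP endpoint that the interactive client in tools/text_generation_cli.py drives, a minimal non-interactive sketch; the server address, prompt, and token budget below are placeholders, not values from the repository:

import json

import requests

# Placeholder address: point this at a server started via one of the
# run_text_generation_server_* example scripts (host:port is deployment-specific).
URL = "http://localhost:5000/api"
HEADERS = {"Content-Type": "application/json"}

# Same request shape as the interactive CLI: a PUT with prompts and a token budget.
payload = {"prompts": ["The quick brown fox"], "tokens_to_generate": 32}
response = requests.put(URL, data=json.dumps(payload), headers=HEADERS)
response.raise_for_status()
print(response.json()["text"][0])
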