├── .coveragerc
├── .flake8
├── .github
├── ISSUE_TEMPLATE
│ ├── bug.md
│ ├── enhancement.md
│ ├── question.md
│ └── regression.md
└── workflows
│ └── stale.yml
├── .gitignore
├── .gitlab-ci.yml
├── .gitlab
├── labeler-config.yml
├── scripts
│ ├── build.sh
│ └── fetch-legacy-suite.sh
└── stages
│ ├── 00.pre.yml
│ ├── 01.build.yml
│ ├── 02.test.yml
│ ├── 03.integration-tests.yml
│ ├── 04.functional-tests.yml
│ └── 05.publish.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── CHANGELOG.md
├── CODEOWNERS
├── CONTRIBUTING.md
├── Dockerfile.ci.dev
├── Dockerfile.ci.lts
├── Dockerfile.linting
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
├── llama_mistral.md
└── source
│ ├── api-guide
│ ├── context_parallel.rst
│ ├── custom_fsdp.md
│ ├── datasets.rst
│ ├── dist_checkpointing.rst
│ ├── dist_checkpointing.strategies.rst
│ ├── dist_optimizer.md
│ ├── distributed.rst
│ ├── encoder_decoder_parallelism.rst
│ ├── fusions.rst
│ ├── index.rst
│ ├── models.bert.rst
│ ├── models.gpt.rst
│ ├── models.rst
│ ├── models.t5.rst
│ ├── moe.rst
│ ├── multi_latent_attention.rst
│ ├── multi_token_prediction.md
│ ├── num_microbatches_calculator.rst
│ ├── optimizer_cpu_offload.rst
│ ├── optimizer_param_scheduler.rst
│ ├── pipeline_parallel.rst
│ ├── tensor_parallel.rst
│ └── transformer.rst
│ ├── images
│ ├── context_parallel
│ │ ├── CP_overview.png
│ │ └── CP_results.png
│ ├── custom_fsdp
│ │ ├── FSDP_Allreduce.png
│ │ ├── FSDP_workflow.png
│ │ └── MCore_Custom_FSDP_Class_Diagram.png
│ ├── distrib_optimizer
│ │ ├── data_flow.png
│ │ └── sharding_scheme.png
│ ├── moe
│ │ └── token_drop.png
│ └── multi_token_prediction
│ │ └── MTP_implementation.png
│ ├── index.rst
│ └── user-guide
│ └── index.rst
├── examples
├── academic_paper_scripts
│ ├── detxoify_lm
│ │ ├── README.md
│ │ ├── annotations
│ │ │ ├── filter-selfgeneration.py
│ │ │ ├── perspective_api_annotate.py
│ │ │ └── preprocess.sh
│ │ ├── finetune_gpt.py
│ │ ├── finetune_gpt_distributed-1.3b.sh
│ │ ├── generate-1.3b.sh
│ │ ├── generate_samples_gpt.py
│ │ ├── perspective_api.py
│ │ └── self_generation
│ │ │ └── selfgenerate-1.3b-unconditional.sh
│ ├── msdp
│ │ ├── README.md
│ │ ├── data_processing.sh
│ │ ├── eval_knwl_generation.sh
│ │ ├── eval_resp_generation.sh
│ │ ├── prep_resp_gen.sh
│ │ ├── prompt_knwl_gen.sh
│ │ └── prompt_resp_gen.sh
│ └── sc21
│ │ ├── CONFIG.sh
│ │ ├── README.md
│ │ ├── SBATCH.sh
│ │ ├── SRUN.sh
│ │ ├── run_figure_11.sh
│ │ ├── run_figure_12.sh
│ │ ├── run_figure_13.sh
│ │ ├── run_figure_14.sh
│ │ ├── run_figure_15.sh
│ │ ├── run_figure_16.sh
│ │ ├── run_figure_17.sh
│ │ ├── run_figure_18.sh
│ │ └── run_table_1.sh
├── bert
│ ├── README.md
│ └── train_bert_340m_distributed.sh
├── export
│ ├── README.md
│ └── trtllm_export
│ │ ├── README.md
│ │ ├── distributed_export
│ │ └── gpt_distributed_gpu_export.py
│ │ └── single_device_export
│ │ └── gpt_single_device_cpu_export.py
├── gpt3
│ ├── README.md
│ ├── gpt_config.yaml
│ └── train_gpt3_175b_distributed.sh
├── inference
│ ├── README.md
│ ├── gpt
│ │ ├── gpt_dynamic_inference.py
│ │ ├── gpt_dynamic_inference_12b.sh
│ │ ├── gpt_dynamic_inference_357m.sh
│ │ ├── gpt_static_inference.py
│ │ └── utils.py
│ ├── llama_mistral
│ │ ├── huggingface_reference.py
│ │ ├── run_text_generation_llama3.1.sh
│ │ ├── run_text_generation_llama3.sh
│ │ └── run_text_generation_mistral.sh
│ ├── run_text_generation_server_345M.sh
│ ├── run_text_generation_server_345M_8_tensor_parallel.sh
│ └── t5
│ │ └── simple_t5_batch_inference.py
├── mamba
│ ├── .gitignore
│ ├── Dockerfile
│ ├── README.md
│ ├── run_text_gen_server_8b.sh
│ ├── run_text_gen_server_8b_gpt3.sh
│ └── train.sh
├── mixtral
│ ├── README.md
│ └── train_mixtral_8x7b_distributed.sh
├── multimodal
│ ├── Dockerfile
│ ├── README.md
│ ├── assets
│ │ └── pretrain_curves.png
│ ├── combine_lm_vision_checkpoints.sh
│ ├── combine_state_dicts.py
│ ├── config.py
│ ├── convert_llava_pretrain_to_wds.py
│ ├── dataloader_provider.py
│ ├── dataset_helpers.py
│ ├── energon_util.py
│ ├── evaluation
│ │ ├── evaluate_ai2d.py
│ │ ├── evaluate_chartqa.py
│ │ ├── evaluate_coco.py
│ │ ├── evaluate_infovqa.py
│ │ ├── evaluate_mathvista.py
│ │ ├── evaluate_mmmu.py
│ │ ├── evaluate_ocrbench.py
│ │ ├── evaluate_ocrbench_v2.py
│ │ ├── evaluate_rd_tablebench.py
│ │ ├── evaluate_realworldqa.py
│ │ ├── evaluate_spdocvqa.py
│ │ ├── evaluate_textvqa.py
│ │ ├── evaluate_video_motionbench.py
│ │ ├── evaluate_video_mvbench.py
│ │ ├── evaluate_video_phys_game_bench.py
│ │ ├── evaluate_vqav2.py
│ │ ├── evaluation_datasets.py
│ │ └── mmmu_utils.py
│ ├── image_processing.py
│ ├── layer_scaling.py
│ ├── layer_specs.py
│ ├── manual_prompts.json
│ ├── model.py
│ ├── model_converter
│ │ ├── clip_converter.py
│ │ ├── internvit_converter.py
│ │ ├── radio_converter.py
│ │ ├── siglip_converter.py
│ │ └── vision_model_tester.py
│ ├── multimodal_args.py
│ ├── nvlm
│ │ ├── README.md
│ │ ├── internvit.py
│ │ ├── nvlm_prompts.json
│ │ ├── pp_checkpoint_converter.py
│ │ ├── pretrain_blend.yaml
│ │ ├── pretrain_qwen20_72b_internvit_6b.sh
│ │ ├── pretrain_yi_34b_internvit_6b.sh
│ │ ├── run_text_generation_qwen20_72b_internvit_6b.sh
│ │ ├── run_text_generation_qwen25_7b_internvit_video.sh
│ │ ├── run_text_generation_qwen25_7b_siglip.sh
│ │ ├── run_text_generation_yi_34b_internvit_6b.sh
│ │ ├── sft_34b_internvit.sh
│ │ ├── sft_blend.yaml
│ │ ├── sft_qwen20_72b_internvit_6b.sh
│ │ └── sft_qwen2p5_7b_internvit_6b_video.sh
│ ├── pretrain_dataset.yaml
│ ├── pretrain_mistral_clip.sh
│ ├── radio
│ │ └── radio_g.py
│ ├── run_text_generation.py
│ ├── sft_dataset.yaml
│ ├── sft_mistral_clip.sh
│ ├── text_generation_mistral_clip.sh
│ └── train.py
├── post_training
│ └── modelopt
│ │ ├── README.md
│ │ ├── conf
│ │ ├── arguments.sh
│ │ ├── deepseek-ai
│ │ │ ├── DeepSeek-R1.sh
│ │ │ └── DeepSeek-V2-Lite.sh
│ │ ├── meta-llama
│ │ │ ├── Llama-3.1-8B-Instruct.sh
│ │ │ ├── Llama-3.2-1B-Instruct.sh
│ │ │ ├── Llama-4-Maverick-17B-128E-Instruct.sh
│ │ │ └── Llama-4-Scout-17B-16E-Instruct.sh
│ │ ├── nvidia
│ │ │ ├── Nemotron-H-4B-Instruct.sh
│ │ │ ├── Nemotron-H-8B-Base-8K.sh
│ │ │ └── Nemotron-Mini-4B-Instruct.sh
│ │ └── qwen
│ │ │ ├── Qwen3-235B-A22B.sh
│ │ │ └── Qwen3-30B-A3B.sh
│ │ ├── convert.sh
│ │ ├── convert_model.py
│ │ ├── export.py
│ │ ├── export.sh
│ │ ├── finetune.py
│ │ ├── finetune.sh
│ │ ├── generate.py
│ │ ├── generate.sh
│ │ ├── mmlu.py
│ │ ├── mmlu.sh
│ │ ├── quantize.py
│ │ ├── quantize.sh
│ │ ├── requirements.txt
│ │ └── speculative.md
├── retro
│ ├── README.md
│ ├── preprocess_data.sh
│ └── train_retro_2b_distributed.sh
├── run_simple_mcore_train_loop.py
└── t5
│ ├── README.md
│ ├── t5_mcore_train_curve.png
│ └── train_t5_220m_distributed.sh
├── images
├── model_table.png
├── strong_scaling.png
└── weak_scaling.png
├── megatron
├── core
│ ├── MSC_Integration.md
│ ├── QuickStart.md
│ ├── README.md
│ ├── README_STRAGGLER.md
│ ├── __init__.py
│ ├── config.py
│ ├── config_logger.py
│ ├── datasets
│ │ ├── Makefile
│ │ ├── __init__.py
│ │ ├── bert_dataset.py
│ │ ├── blended_dataset.py
│ │ ├── blended_megatron_dataset_builder.py
│ │ ├── blended_megatron_dataset_config.py
│ │ ├── gpt_dataset.py
│ │ ├── helpers.cpp
│ │ ├── helpers.py
│ │ ├── indexed_dataset.py
│ │ ├── masked_dataset.py
│ │ ├── megatron_dataset.py
│ │ ├── megatron_tokenizer.py
│ │ ├── multimodal_dataset.py
│ │ ├── object_storage_utils.py
│ │ ├── readme.md
│ │ ├── retro
│ │ │ ├── __init__.py
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bert_embedders.py
│ │ │ │ ├── config.py
│ │ │ │ ├── gpt_chunk_datasets.py
│ │ │ │ └── tokenizers.py
│ │ │ ├── db
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── dataset.py
│ │ │ │ └── utils.py
│ │ │ ├── external_libs.py
│ │ │ ├── index
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── index.py
│ │ │ │ ├── indexes
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── faiss_base.py
│ │ │ │ │ └── faiss_par_add.py
│ │ │ │ ├── utils.py
│ │ │ │ └── validate.py
│ │ │ ├── query
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt_chunk_dataset.py
│ │ │ │ ├── multi_split_gpt_dataset.py
│ │ │ │ ├── query.py
│ │ │ │ ├── retro_dataset.py
│ │ │ │ └── utils.py
│ │ │ └── utils.py
│ │ ├── t5_dataset.py
│ │ ├── utils.py
│ │ ├── utils_object_storage.py
│ │ └── utils_s3.py
│ ├── dist_checkpointing
│ │ ├── __init__.py
│ │ ├── core.py
│ │ ├── dict_utils.py
│ │ ├── exchange_utils.py
│ │ ├── mapping.py
│ │ ├── optimizer.py
│ │ ├── serialization.py
│ │ ├── state_dict_utils.py
│ │ ├── strategies
│ │ │ ├── __init__.py
│ │ │ ├── async_utils.py
│ │ │ ├── base.py
│ │ │ ├── cached_metadata_filesystem_reader.py
│ │ │ ├── common.py
│ │ │ ├── filesystem_async.py
│ │ │ ├── fully_parallel.py
│ │ │ ├── resharding.py
│ │ │ ├── state_dict_saver.py
│ │ │ ├── tensorstore.py
│ │ │ ├── torch.py
│ │ │ ├── two_stage.py
│ │ │ └── zarr.py
│ │ ├── tensor_aware_state_dict.py
│ │ ├── utils.py
│ │ └── validation.py
│ ├── distributed
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── custom_fsdp
│ │ │ ├── __init__.py
│ │ │ ├── fully_sharded_data_parallel.py
│ │ │ └── param_and_grad_buffer.py
│ │ ├── data_parallel_base.py
│ │ ├── distributed_data_parallel.py
│ │ ├── distributed_data_parallel_config.py
│ │ ├── finalize_model_grads.py
│ │ ├── param_and_grad_buffer.py
│ │ ├── torch_fully_sharded_data_parallel.py
│ │ └── torch_fully_sharded_data_parallel_config.py
│ ├── enums.py
│ ├── export
│ │ ├── __init__.py
│ │ ├── data_type.py
│ │ ├── export_config.py
│ │ ├── model_type.py
│ │ └── trtllm
│ │ │ ├── __init__.py
│ │ │ ├── engine_builder
│ │ │ ├── __init__.py
│ │ │ └── trtllm_engine_builder.py
│ │ │ ├── model_to_trllm_mapping
│ │ │ ├── __init__.py
│ │ │ └── default_conversion_dict.py
│ │ │ ├── trt_model_config.py
│ │ │ ├── trt_model_type.py
│ │ │ ├── trtllm_helper.py
│ │ │ ├── trtllm_layers.py
│ │ │ └── trtllm_weights_converter
│ │ │ ├── __init__.py
│ │ │ ├── distributed_trtllm_model_weights_converter.py
│ │ │ ├── single_device_trtllm_model_weights_converter.py
│ │ │ └── utils.py
│ ├── extensions
│ │ ├── __init__.py
│ │ └── transformer_engine.py
│ ├── fp8_utils.py
│ ├── fusions
│ │ ├── __init__.py
│ │ ├── fused_bias_dropout.py
│ │ ├── fused_bias_geglu.py
│ │ ├── fused_bias_gelu.py
│ │ ├── fused_bias_swiglu.py
│ │ ├── fused_cross_entropy.py
│ │ ├── fused_indices_converter.py
│ │ ├── fused_layer_norm.py
│ │ └── fused_softmax.py
│ ├── inference
│ │ ├── __init__.py
│ │ ├── async_stream.py
│ │ ├── common_inference_params.py
│ │ ├── communication_utils.py
│ │ ├── contexts
│ │ │ ├── __init__.py
│ │ │ ├── base_context.py
│ │ │ ├── dynamic_chunk_allocator.py
│ │ │ ├── dynamic_context.py
│ │ │ └── static_context.py
│ │ ├── engines
│ │ │ ├── __init__.py
│ │ │ ├── abstract_engine.py
│ │ │ ├── dynamic_engine.py
│ │ │ ├── mcore_engine.py
│ │ │ └── static_engine.py
│ │ ├── inference_request.py
│ │ ├── model_inference_wrappers
│ │ │ ├── __init__.py
│ │ │ ├── abstract_model_inference_wrapper.py
│ │ │ ├── gpt
│ │ │ │ ├── __init__.py
│ │ │ │ └── gpt_inference_wrapper.py
│ │ │ ├── inference_wrapper_config.py
│ │ │ ├── multimodal
│ │ │ │ └── vlm_inference_wrapper.py
│ │ │ └── t5
│ │ │ │ ├── __init__.py
│ │ │ │ └── t5_inference_wrapper.py
│ │ ├── sampling_params.py
│ │ ├── scheduler.py
│ │ ├── text_generation_controllers
│ │ │ ├── __init__.py
│ │ │ ├── encoder_decoder_text_generation_controller.py
│ │ │ ├── simple_text_generation_controller.py
│ │ │ ├── text_generation_controller.py
│ │ │ └── vlm_text_generation_controller.py
│ │ └── utils.py
│ ├── inference_params.py
│ ├── jit.py
│ ├── model_parallel_config.py
│ ├── models
│ │ ├── T5
│ │ │ ├── __init__.py
│ │ │ ├── t5_model.py
│ │ │ └── t5_spec.py
│ │ ├── __init__.py
│ │ ├── bert
│ │ │ ├── __init__.py
│ │ │ ├── bert_layer_specs.py
│ │ │ ├── bert_lm_head.py
│ │ │ ├── bert_model.py
│ │ │ └── pooler.py
│ │ ├── common
│ │ │ ├── __init__.py
│ │ │ ├── embeddings
│ │ │ │ ├── __init__.py
│ │ │ │ ├── language_model_embedding.py
│ │ │ │ ├── relative_pos_embedding.py
│ │ │ │ ├── rope_utils.py
│ │ │ │ ├── rotary_pos_embedding.py
│ │ │ │ └── yarn_rotary_pos_embedding.py
│ │ │ ├── language_module
│ │ │ │ ├── __init__.py
│ │ │ │ └── language_module.py
│ │ │ └── vision_module
│ │ │ │ ├── __init__.py
│ │ │ │ └── vision_module.py
│ │ ├── gpt
│ │ │ ├── __init__.py
│ │ │ ├── gpt_layer_specs.py
│ │ │ ├── gpt_model.py
│ │ │ ├── heterogeneous
│ │ │ │ └── heterogeneous_layer_specs.py
│ │ │ └── moe_module_specs.py
│ │ ├── huggingface
│ │ │ ├── __init__.py
│ │ │ ├── clip_model.py
│ │ │ ├── module.py
│ │ │ └── qwen_model.py
│ │ ├── mamba
│ │ │ ├── __init__.py
│ │ │ ├── mamba_layer_specs.py
│ │ │ └── mamba_model.py
│ │ ├── mimo
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ └── base_configs.py
│ │ │ ├── model
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ └── submodules
│ │ │ │ ├── audio.py
│ │ │ │ ├── base.py
│ │ │ │ └── vision.py
│ │ ├── multimodal
│ │ │ ├── __init__.py
│ │ │ ├── context_parallel.py
│ │ │ ├── llava_model.py
│ │ │ └── llava_spec.py
│ │ ├── retro
│ │ │ ├── __init__.py
│ │ │ ├── base_attention.py
│ │ │ ├── config.py
│ │ │ ├── decoder_attention.py
│ │ │ ├── decoder_spec.py
│ │ │ ├── encoder_attention.py
│ │ │ ├── encoder_spec.py
│ │ │ ├── model.py
│ │ │ └── utils.py
│ │ └── vision
│ │ │ ├── __init__.py
│ │ │ ├── clip_vit_model.py
│ │ │ ├── multimodal_projector.py
│ │ │ ├── radio.py
│ │ │ └── vit_layer_specs.py
│ ├── msc_utils.py
│ ├── num_microbatches_calculator.py
│ ├── optimizer
│ │ ├── __init__.py
│ │ ├── clip_grads.py
│ │ ├── cpu_offloading
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── hybrid_optimizer.py
│ │ ├── distrib_optimizer.py
│ │ ├── grad_scaler.py
│ │ ├── optimizer.py
│ │ └── optimizer_config.py
│ ├── optimizer_param_scheduler.py
│ ├── package_info.py
│ ├── packed_seq_params.py
│ ├── parallel_state.py
│ ├── pipeline_parallel
│ │ ├── __init__.py
│ │ ├── p2p_communication.py
│ │ └── schedules.py
│ ├── post_training
│ │ ├── __init__.py
│ │ └── modelopt
│ │ │ ├── __init__.py
│ │ │ ├── gpt
│ │ │ ├── __init__.py
│ │ │ ├── model_specs.py
│ │ │ └── state_dict_hooks.py
│ │ │ ├── layers.py
│ │ │ └── mamba
│ │ │ ├── __init__.py
│ │ │ └── model_specs.py
│ ├── process_groups_config.py
│ ├── requirements.txt
│ ├── rerun_state_machine.py
│ ├── ssm
│ │ ├── __init__.py
│ │ ├── mamba_block.py
│ │ ├── mamba_hybrid_layer_allocation.py
│ │ ├── mamba_layer.py
│ │ ├── mamba_mixer.py
│ │ ├── mlp_layer.py
│ │ └── triton_cache_manager.py
│ ├── tensor_parallel
│ │ ├── __init__.py
│ │ ├── cross_entropy.py
│ │ ├── data.py
│ │ ├── layers.py
│ │ ├── mappings.py
│ │ ├── random.py
│ │ └── utils.py
│ ├── timers.py
│ ├── transformer
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── cuda_graphs.py
│ │ ├── custom_layers
│ │ │ ├── __init__.py
│ │ │ └── transformer_engine.py
│ │ ├── dot_product_attention.py
│ │ ├── enums.py
│ │ ├── heterogeneous
│ │ │ ├── heterogeneous_config.py
│ │ │ └── linear_replacements.py
│ │ ├── identity_op.py
│ │ ├── mlp.py
│ │ ├── module.py
│ │ ├── moe
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── experts.py
│ │ │ ├── fused_a2a.py
│ │ │ ├── grouped_gemm_util.py
│ │ │ ├── legacy_a2a_token_dispatcher.py
│ │ │ ├── moe_layer.py
│ │ │ ├── moe_utils.py
│ │ │ ├── router.py
│ │ │ ├── shared_experts.py
│ │ │ ├── token_dispatcher.py
│ │ │ └── upcycling_utils.py
│ │ ├── multi_latent_attention.py
│ │ ├── multi_token_prediction.py
│ │ ├── spec_utils.py
│ │ ├── torch_layer_norm.py
│ │ ├── torch_norm.py
│ │ ├── transformer_block.py
│ │ ├── transformer_config.py
│ │ ├── transformer_layer.py
│ │ └── utils.py
│ └── utils.py
├── inference
│ ├── __init__.py
│ ├── endpoints
│ │ ├── common.py
│ │ └── completions.py
│ ├── static
│ │ └── index.html
│ ├── text_generation
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── beam_utils.py
│ │ ├── communication.py
│ │ ├── forward_step.py
│ │ ├── generation.py
│ │ ├── mcore_engine_server.py
│ │ ├── sampling.py
│ │ └── tokenization.py
│ └── text_generation_server.py
├── legacy
│ ├── data
│ │ ├── __init__.py
│ │ ├── autoaugment.py
│ │ ├── biencoder_dataset_utils.py
│ │ ├── data_samplers.py
│ │ ├── dataset_utils.py
│ │ ├── ict_dataset.py
│ │ ├── image_folder.py
│ │ ├── multimodal_dataset.py
│ │ ├── orqa_wiki_dataset.py
│ │ ├── realm_dataset_utils.py
│ │ ├── realm_index.py
│ │ └── vit_dataset.py
│ ├── fp16_deprecated
│ │ └── loss_scaler.py
│ ├── fused_kernels
│ │ ├── __init__.py
│ │ ├── compat.h
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── test_fused_kernels.py
│ │ └── type_shim.h
│ ├── indexer.py
│ ├── model
│ │ ├── __init__.py
│ │ ├── bert_model.py
│ │ ├── biencoder_model.py
│ │ ├── classification.py
│ │ ├── enums.py
│ │ ├── fused_bias_gelu.py
│ │ ├── fused_layer_norm.py
│ │ ├── fused_softmax.py
│ │ ├── gpt_model.py
│ │ ├── language_model.py
│ │ ├── module.py
│ │ ├── multiple_choice.py
│ │ ├── realm_model.py
│ │ ├── rms_norm.py
│ │ ├── t5_model.py
│ │ ├── transformer.py
│ │ ├── utils.py
│ │ └── vision
│ │ │ ├── classification.py
│ │ │ ├── dino.py
│ │ │ ├── esvit_swin_backbone.py
│ │ │ ├── inpainting.py
│ │ │ ├── knn_monitor.py
│ │ │ ├── mit_backbone.py
│ │ │ ├── swin_backbone.py
│ │ │ ├── utils.py
│ │ │ └── vit_backbone.py
│ └── mpu
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── commons.py
│ │ ├── test_cross_entropy.py
│ │ ├── test_data.py
│ │ ├── test_initialize.py
│ │ ├── test_layers.py
│ │ └── test_random.py
├── post_training
│ ├── __init__.py
│ ├── algos
│ │ ├── __init__.py
│ │ └── distillation.py
│ ├── arguments.py
│ ├── checkpointing.py
│ ├── docs
│ │ └── distillation.md
│ ├── generate.py
│ ├── loss_func.py
│ ├── model_provider.py
│ ├── non_loss_data_func.py
│ └── utils.py
└── training
│ ├── __init__.py
│ ├── activations.py
│ ├── arguments.py
│ ├── async_utils.py
│ ├── checkpointing.py
│ ├── dist_signal_handler.py
│ ├── ft_integration.py
│ ├── global_vars.py
│ ├── initialize.py
│ ├── inprocess_restart.py
│ ├── log_handler.py
│ ├── one_logger_utils.py
│ ├── theoretical_memory_usage.py
│ ├── tokenizer
│ ├── __init__.py
│ ├── bert_tokenization.py
│ ├── gpt2_tokenization.py
│ ├── multimodal_tokenizer.py
│ └── tokenizer.py
│ ├── training.py
│ ├── utils.py
│ ├── wandb_utils.py
│ └── yaml_arguments.py
├── model_config.yaml
├── mypy.ini
├── patches
└── nemo_2.3.0_te.patch
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_ict.py
├── pretrain_mamba.py
├── pretrain_retro.py
├── pretrain_t5.py
├── pretrain_vision_classify.py
├── pretrain_vision_dino.py
├── pretrain_vision_inpaint.py
├── pretrain_vlm.py
├── pyproject.toml
├── pytest.ini
├── requirements
├── pytorch_24.01
│ └── requirements.txt
├── pytorch_24.07
│ └── requirements.txt
├── pytorch_24.10
│ └── requirements.txt
└── pytorch_25.03
│ └── requirements.txt
├── requirements_ci.txt
├── requirements_mlm.txt
├── setup.py
├── tasks
├── data_utils.py
├── ensemble_classifier.py
├── eval_utils.py
├── finetune_utils.py
├── glue
│ ├── data.py
│ ├── finetune.py
│ ├── mnli.py
│ └── qqp.py
├── main.py
├── msdp
│ ├── README.md
│ ├── evaluate.py
│ ├── main.py
│ ├── metrics.py
│ ├── preprocessing.py
│ └── prompt.py
├── orqa
│ ├── README.md
│ ├── evaluate_orqa.py
│ ├── evaluate_utils.py
│ ├── supervised
│ │ ├── data.py
│ │ ├── eval_utils.py
│ │ └── finetune.py
│ └── unsupervised
│ │ ├── nq.py
│ │ ├── qa_utils.py
│ │ └── tokenizers.py
├── quantize
│ └── calibrate_gpt.py
├── race
│ ├── data.py
│ └── finetune.py
├── vision
│ ├── classification
│ │ ├── classification.py
│ │ └── eval_utils.py
│ ├── finetune_utils.py
│ ├── main.py
│ └── segmentation
│ │ ├── cityscapes.py
│ │ ├── data.py
│ │ ├── finetune_segformer.py
│ │ ├── finetune_setr.py
│ │ ├── metrics.py
│ │ ├── seg_heads.py
│ │ ├── seg_models.py
│ │ ├── transforms.py
│ │ └── utils.py
└── zeroshot_gpt
│ ├── datasets.py
│ ├── detokenizer.py
│ └── evaluate.py
├── tests
├── __init__.py
├── functional_tests
│ ├── __init__.py
│ ├── python_test_utils
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── conftest.py
│ │ ├── get_test_results_from_tensorboard_logs.py
│ │ ├── test_inference_regular_pipeline.py
│ │ ├── test_pretraining_regular_pipeline.py
│ │ └── test_pretraining_resume_checkpoint_pipeline.py
│ ├── shell_test_utils
│ │ ├── _run_training.sh
│ │ ├── run_ci_test.sh
│ │ └── start_interactive_job.sh
│ └── test_cases
│ │ ├── bert
│ │ ├── bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── bert_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── common
│ │ └── ckpt_converter
│ │ │ ├── __main__.py
│ │ │ └── model_config.yaml
│ │ ├── gpt-nemo
│ │ ├── bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G
│ │ │ └── model_config.yaml
│ │ ├── llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G
│ │ │ └── model_config.yaml
│ │ └── t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gpt
│ │ ├── gpt3_15b_8t_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.8.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_15b_8t_release_sm
│ │ │ ├── golden_values_0.11.0_PyT24.10_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_PyT25.01_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_PyT25.03_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── gpt_inference_tp1_pp1_583m_logitsmatch
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── hybrid
│ │ ├── hybrid_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── hybrid_mr_mcore_te_tp2_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral
│ │ ├── mixtral_8x22b_tp2pp8ep8vpp1_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_patch_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.8.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm
│ │ │ └── model_config.yaml
│ │ └── mixtral_8x7b_tp1pp4ep8vpp8_release
│ │ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── moe
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts
│ │ │ ├── golden_values_dev.json
│ │ │ ├── golden_values_lts.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mcore_te_tp1_pp1_ep8_etp1_cp_memory_speed
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal-llava
│ │ ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── multimodal_llava_mr_mcore_te_tp4_sp_cp2_dgx_a100_1N8G
│ │ │ ├── golden_values_dev_dgx_a100.json
│ │ │ ├── golden_values_dev_dgx_h100.json
│ │ │ ├── golden_values_lts_dgx_a100.json
│ │ │ └── model_config.yaml
│ │ └── t5
│ │ ├── t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1
│ │ ├── golden_values_dev_dgx_a100.json
│ │ ├── golden_values_dev_dgx_h100.json
│ │ ├── golden_values_lts_dgx_a100.json
│ │ └── model_config.yaml
│ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
│ │ └── golden_values_lts_dgx_a100.json
│ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
│ │ └── golden_values_lts_dgx_a100.json
│ │ └── t5_release
│ │ ├── golden_values_0.10.0_dgx_a100.json
│ │ ├── golden_values_0.11.0_dgx_a100.json
│ │ ├── golden_values_0.12.0_dgx_a100.json
│ │ ├── golden_values_0.9.0_dgx_a100.json
│ │ └── model_config.yaml
├── test_utils
│ ├── python_scripts
│ │ ├── auto_reminder.py
│ │ ├── check_status_of_main.py
│ │ ├── common.py
│ │ ├── dashboard.py
│ │ ├── download_coverage_results.py
│ │ ├── download_golden_values.py
│ │ ├── generate_jet_trigger_job.py
│ │ ├── generate_local_jobs.py
│ │ ├── launch_jet_workload.py
│ │ ├── notify.py
│ │ └── wait_for_resources.py
│ └── recipes
│ │ ├── _build-mcore-dev.yaml
│ │ ├── _build-mcore-lts.yaml
│ │ ├── _build-nemo.yaml
│ │ ├── bert.yaml
│ │ ├── common.yaml
│ │ ├── gpt-inference.yaml
│ │ ├── gpt-nemo.yaml
│ │ ├── gpt.yaml
│ │ ├── mamba.yaml
│ │ ├── moe.yaml
│ │ ├── multimodal-llava.yaml
│ │ ├── t5.yaml
│ │ └── unit-tests.yaml
└── unit_tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── data
│ ├── __init__.py
│ ├── test_bin_reader.py
│ ├── test_builder.py
│ ├── test_gpt_dataset.py
│ ├── test_multimodal_dataset.py
│ ├── test_preprocess_data.py
│ └── test_preprocess_mmdata.py
│ ├── dist_checkpointing
│ ├── __init__.py
│ ├── conftest.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── test_bert_model.py
│ │ ├── test_gpt_model.py
│ │ ├── test_mamba.py
│ │ ├── test_mlp_glu.py
│ │ ├── test_moe_experts.py
│ │ └── test_t5_model.py
│ ├── test_async_save.py
│ ├── test_flattened_resharding.py
│ ├── test_fp8.py
│ ├── test_fully_parallel.py
│ ├── test_global_metadata_reuse.py
│ ├── test_local.py
│ ├── test_mapping.py
│ ├── test_msc.py
│ ├── test_nonpersistent.py
│ ├── test_optimizer.py
│ ├── test_replication.py
│ ├── test_serialization.py
│ ├── test_torch_dist.py
│ └── utils.py
│ ├── distributed
│ ├── test_distributed_data_parallel.py
│ ├── test_finalize_model_grads.py
│ ├── test_grad_reduce_for_replicated_embedder.py
│ ├── test_grad_sync_with_expert_parallel.py
│ ├── test_mcore_fully_sharded_data_parallel.py
│ ├── test_param_and_grad_buffer.py
│ └── test_torch_fully_sharded_parallel.py
│ ├── export
│ └── trtllm
│ │ ├── __init__.py
│ │ ├── test_distributed_fp8.py
│ │ ├── test_single_device_fp8.py
│ │ ├── test_trtllm_distributed_gpu_converter.py
│ │ ├── test_trtllm_helper.py
│ │ ├── test_trtllm_layers.py
│ │ └── test_trtllm_single_device_converter.py
│ ├── fusions
│ ├── test_bias_dropout_fusion.py
│ ├── test_swiglu_fusion.py
│ └── test_torch_softmax.py
│ ├── inference
│ ├── __init__.py
│ ├── contexts
│ │ └── test_dynamic_context.py
│ ├── engines
│ │ ├── __init__.py
│ │ ├── test_dynamic_engine.py
│ │ └── test_static_engine.py
│ ├── model_inference_wrappers
│ │ ├── __init__.py
│ │ ├── gpt
│ │ │ └── test_gpt_inference_wrapper.py
│ │ ├── t5
│ │ │ └── test_t5_inference_wrapper.py
│ │ └── test_model_inference_wrapper_config.py
│ ├── test_common_inference_params.py
│ ├── test_communication_utils.py
│ ├── test_flash_decode.py
│ ├── test_inference_utils.py
│ ├── test_scheduler.py
│ └── text_generation_controllers
│ │ ├── __init__.py
│ │ ├── test_encoder_decoder_text_generation_controller.py
│ │ ├── test_simple_text_generation_controller.py
│ │ └── test_vlm_text_generation_controller.py
│ ├── models
│ ├── __init__.py
│ ├── test_base_embedding.py
│ ├── test_bert_model.py
│ ├── test_clip_vit_model.py
│ ├── test_gpt_model.py
test_gpt_model.py │ ├── test_heterogeneous_gpt_model.py │ ├── test_llava_model.py │ ├── test_mamba_model.py │ ├── test_mimo_audio_submodules.py │ ├── test_mimo_embedding_alignment.py │ ├── test_mimo_model.py │ ├── test_mimo_submodules.py │ ├── test_multimodal_projector.py │ ├── test_radio_model.py │ └── test_t5_model.py │ ├── pipeline_parallel │ ├── __init__.py │ ├── test_helpers.py │ └── test_schedules.py │ ├── post_training │ ├── __init__.py │ └── test_modelopt_module_spec.py │ ├── run_ci_test.sh │ ├── ssm │ ├── test_mamba_block.py │ ├── test_mamba_hybrid_layer_allocation.py │ ├── test_mamba_layer.py │ └── test_mamba_mixer.py │ ├── tensor_parallel │ ├── __init__.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialization.py │ ├── test_layers.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_checkpointing.py │ ├── test_fp8_param.py │ ├── test_imports.py │ ├── test_inference.py │ ├── test_local_multi_tensor_fns.py │ ├── test_model_configs.py │ ├── test_num_microbatches_calculator.py │ ├── test_optimizer.py │ ├── test_optimizer_cpu_offloading.py │ ├── test_optimizer_param_scheduler.py │ ├── test_parallel_state.py │ ├── test_process_groups_config.py │ ├── test_tokenizer.py │ ├── test_training.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── moe │ ├── __init__.py │ ├── conftest.py │ ├── test_a2a_token_dispatcher.py │ ├── test_aux_loss.py │ ├── test_grouped_mlp.py │ ├── test_moe_layer.py │ ├── test_moe_layer_discrepancy.py │ ├── test_multihot_indices_converter.py │ ├── test_routers.py │ ├── test_sequential_mlp.py │ ├── test_shared_experts.py │ ├── test_token_dispatcher.py │ └── test_upcycling.py │ ├── test_attention.py │ ├── test_attention_no_rope.py │ ├── test_attention_packed_seq.py │ ├── test_core_attention.py │ ├── test_cuda_graphs.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_multi_latent_attention.py │ ├── test_multi_token_prediction.py │ ├── test_relative_attention.py │ ├── test_retro_attention.py │ ├── test_rope.py │ ├── test_spec_customization.py │ ├── test_transformer_block.py │ ├── test_transformer_block_custom_pgs.py │ ├── test_transformer_layer.py │ └── test_utils.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py └── huggingface.py ├── checkpoint ├── convert.py ├── hybrid_conversion.py ├── loader_base.py ├── loader_core.py ├── loader_legacy.py ├── loader_llama_mistral.py ├── loader_llava.py ├── loader_mixtral_hf.py ├── saver_base.py ├── saver_core.py ├── saver_hf_llava.py ├── saver_legacy.py ├── saver_llava.py ├── schema_base.py ├── schema_core.py ├── schema_hf.py └── utils.py ├── copyright.sh ├── linter.py ├── merge_datasets.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── report_theoretical_memory.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── config_utils.py ├── docker │ └── Dockerfile ├── preprocess_data.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh └── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py ├── run_inference_performance_test.py ├── run_mamba_text_generation_server.py ├── run_mamba_text_generation_server_completions.py ├── run_text_generation_server.py ├── run_vlm_text_generation.py ├── text_generation_cli.py └── wait_daemon.sh 
/.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | relative_files = true 7 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | extend-ignore = E203,E501,F401,E402,E714 4 | per-file-ignores = __init__.py:F401 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 21 | 22 | **Environment (please complete the following information):** 23 | - Megatron-LM commit ID 24 | - PyTorch version 25 | - CUDA version 26 | - NCCL version 27 | 28 | **Proposed fix** 29 | If you have a proposal for how to fix the issue, state it here or link to a PR. 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ENHANCEMENT 3 | about: Suggest an idea to improve this project 4 | title: "[ENHANCEMENT]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Proposed implementation** 20 | If you have a proposed implementation for the feature, state it here or link to a PR. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUESTION 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement 4 | request 5 | title: "[QUESTION]" 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Your question** 12 | Ask a clear and concise question about Megatron-LM.
13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/regression.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: REGRESSION 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update 4 | title: "[REGRESSION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the regression** 11 | A clear and concise description of what the regression is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention. 15 | 16 | **Previous performance** 17 | What speed or accuracy did you previously see? 18 | 19 | **New performance** 20 | What speed or accuracy do you see after the update? 21 | 22 | **Stack trace/logs** 23 | If applicable, add the stack trace or logs related to the regression. 24 | 25 | **Environment (please complete the following information):** 26 | - Previous Megatron-LM commit ID 27 | - New Megatron-LM commit ID 28 | - Previous PyTorch version 29 | - New PyTorch version 30 | - Previous CUDA version 31 | - New CUDA version 32 | - Previous NCCL version 33 | - New NCCL version 34 | 35 | **Proposed fix** 36 | If you have a proposal for how to fix the issue, state it here or link to a PR. 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.'
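# Note: days-before-close is -1 below, so items are only labeled 'stale' and are never closed automatically.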
27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | .vscode 10 | local/ 11 | .gitmodules 12 | wandb/ 13 | onelogger.log 14 | onelogger.err -------------------------------------------------------------------------------- /.gitlab/labeler-config.yml: -------------------------------------------------------------------------------- 1 | CI: 2 | - .gitlab-ci.yml 3 | - Dockerfile.ci.lts 4 | - Dockerfile.ci.dev 5 | - .github/** 6 | - .gitlab/** 7 | 8 | Datasets: 9 | - megatron/core/datasets/** 10 | 11 | BERT: 12 | - megatron/core/models/bert/** 13 | 14 | GPT: 15 | - megatron/core/models/gpt/** 16 | 17 | RETRO: 18 | - megatron/core/models/retro/** 19 | 20 | Dist-Ckpt: 21 | - megatron/core/dist_checkpointing 22 | 23 | Dist-Opt: 24 | - megatron/core/optimizer/distrib_optimizer 25 | 26 | Inference: 27 | - megatron/core/inference 28 | 29 | MoE: 30 | - megatron/core/transformer/moe 31 | 32 | Tests: 33 | - tests/** 34 | 35 | ParallelState: 36 | - megatron/core/parallel_state.py 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 'refs/tags/24.4.2:refs/tags/24.4.2' 4 | hooks: 5 | - id: black 6 | files: ^megatron/core/.* 7 | args: ["--skip-magic-trailing-comma"] 8 | - repo: https://github.com/pycqa/pylint 9 | rev: v3.2.6 10 | hooks: 11 | - id: pylint 12 | files: ^megatron/core/.* 13 | - repo: https://github.com/pycqa/isort 14 | rev: 5.13.2 15 | hooks: 16 | - id: isort 17 | files: ^megatron/core/.* -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | ignore-paths=tests 3 | max-line-length=100 4 | 5 | [MESSAGES CONTROL] 6 | disable=all 7 | 8 | enable=C0115,C0116,W0611,C0301,E0606 9 | # C0115: missing-class-docstring 10 | # C0116: missing-function-docstring 11 | # W0611: unused-import 12 | # C0301: line-too-long 13 | # E0606: possibly-used-before-assignment -------------------------------------------------------------------------------- /Dockerfile.linting: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | ARG FROM_IMAGE_NAME 4 | FROM $FROM_IMAGE_NAME as main 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ 8 | /etc/apt/apt.conf.d/docker-clean 9 | 10 | RUN apt-get update && \ 11 | apt-get install -y python3-venv && \ 12 | apt-get clean && \ 13 | python -m venv /opt/jet 14 | 15 | RUN pip3 install --no-cache-dir \ 16 | black==24.4.2 \ 17 | isort==5.13.2 \ 18 | flake8==7.1.0 \ 19 | pylint==3.2.6 \ 20 | coverage \ 21 | mypy \ 22 | python-gitlab \ 23 | pandas \ 24 | slack-sdk 25 | 26 | WORKDIR /opt/megatron-lm 27 | 28 | ##### For NVIDIANS only ##### 29 | FROM main as jet 30 | ARG JET_API_VERSION 31 | RUN --mount=type=secret,id=JET_INDEX_URLS \ 32 | JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ 33 | pip install --no-cache-dir "jet-client~=2.0" --upgrade $JET_INDEX_URLS 
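# Put the /opt/jet virtualenv (created above) on PATH so its entry points are available.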
34 | ENV PATH="$PATH:/opt/jet/bin" 35 | ### -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/core/requirements.txt 2 | include megatron/core/README.md 3 | recursive-include requirements * 4 | -------------------------------------------------------------------------------- /docs/source/api-guide/index.rst: -------------------------------------------------------------------------------- 1 | API Guide 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | models 8 | tensor_parallel 9 | context_parallel 10 | pipeline_parallel 11 | custom_fsdp 12 | fusions 13 | transformer 14 | moe 15 | dist_checkpointing 16 | dist_optimizer 17 | distributed 18 | datasets 19 | multi_latent_attention 20 | num_microbatches_calculator 21 | optimizer_param_scheduler 22 | optimizer_cpu_offload 23 | multi_token_prediction 24 | encoder_decoder_parallelism -------------------------------------------------------------------------------- /docs/source/api-guide/models.bert.rst: -------------------------------------------------------------------------------- 1 | models.bert package 2 | =================== 3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.bert.bert\_model module 9 | ------------------------------ 10 | 11 | .. automodule:: core.models.bert.bert_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: core.models.bert 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.gpt.rst: -------------------------------------------------------------------------------- 1 | models.gpt package 2 | ================== 3 | This is the implementation of the popular GPT model. It supports several features such as model parallelism (Tensor Parallel, Pipeline Parallel, Data Parallel), mixture of experts, FP8, and the distributed optimizer. We are constantly adding new features, so be on the lookout, or raise an issue if you want to have something added. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.gpt.gpt\_model module 9 | ---------------------------- 10 | 11 | .. automodule:: core.models.gpt.gpt_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: core.models.gpt 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.rst: -------------------------------------------------------------------------------- 1 | models package 2 | ============== 3 | This package contains most of the popular LLMs. Currently we support GPT, BERT, T5, and Retro. This is an ever-growing list, so keep an eye out. 4 | 5 | Subpackages 6 | ----------- 7 | 8 | .. toctree:: 9 | :maxdepth: 4 10 | 11 | models.gpt 12 | models.t5 13 | models.bert 14 | 15 | Module contents 16 | --------------- 17 | 18 | ..
automodule:: core.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.t5.rst: -------------------------------------------------------------------------------- 1 | models.t5 package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | models.t5.t5\_model module 8 | -------------------------- 9 | 10 | .. automodule:: core.models.T5.t5_model 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: core.models.T5 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/moe.rst: -------------------------------------------------------------------------------- 1 | Mixture of Experts package 2 | ========================== 3 | 4 | .. mdinclude :: ../../../megatron/core/transformer/moe/README.md 5 | -------------------------------------------------------------------------------- /docs/source/api-guide/multi_latent_attention.rst: -------------------------------------------------------------------------------- 1 | Multi-Latent Attention 2 | ====================== 3 | 4 | Multi-Latent Attention overview 5 | ------------------------------- 6 | 7 | Multi-Latent Attention ("MLA") is an innovative attention mechanism introduced by the DeepSeek team that enhances the efficiency of attention computation by leveraging multiple latent spaces. This approach is particularly beneficial for large language models (LLMs), as it reduces the computational burden associated with traditional attention mechanisms. According to the DeepSeek-V2 technical report, MLA achieves better performance than Multi-Head Attention (MHA) and requires a smaller KV cache. 8 | 9 | Enabling Multi-Latent Attention 10 | ------------------------------- 11 | 12 | To enable MLA in Megatron-LM, set the following flags on the command line: 13 | - `--multi-latent-attention` to enable MLA in the model. 14 | - Set `MLATransformerConfig` to configure MLA. 15 | -------------------------------------------------------------------------------- /docs/source/api-guide/num_microbatches_calculator.rst: -------------------------------------------------------------------------------- 1 | Microbatches Calculator 2 | ======================= 3 | This API is used to calculate the number of microbatches required to train a given model at a given batch size. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | .. automodule:: core.num_microbatches_calculator 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/api-guide/optimizer_cpu_offload.rst: -------------------------------------------------------------------------------- 1 | Optimizer CPU offload package 2 | ============================== 3 | 4 | .. mdinclude :: ../../../megatron/core/optimizer/cpu_offloading/README.md 5 | -------------------------------------------------------------------------------- /docs/source/api-guide/optimizer_param_scheduler.rst: -------------------------------------------------------------------------------- 1 | Optimizer Parameters Scheduler 2 | ============================== 3 | This API is used to calculate the learning rate and weight decay for the optimizer. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | ..
automodule:: core.optimizer_param_scheduler 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/context_parallel/CP_overview.png -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/context_parallel/CP_results.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/FSDP_Allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/FSDP_Allreduce.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/FSDP_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/FSDP_workflow.png -------------------------------------------------------------------------------- /docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/custom_fsdp/MCore_Custom_FSDP_Class_Diagram.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /docs/source/images/moe/token_drop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/moe/token_drop.png -------------------------------------------------------------------------------- /docs/source/images/multi_token_prediction/MTP_implementation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/docs/source/images/multi_token_prediction/MTP_implementation.png -------------------------------------------------------------------------------- /docs/source/index.rst: 
-------------------------------------------------------------------------------- 1 | .. Megatron Core documentation master file, created by 2 | sphinx-quickstart on Tue Aug 15 13:44:10 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Megatron Core User Guide 7 | =================================== 8 | 9 | **Megatron Core** is a Python library that contains the core components required to build your language models. 10 | A reference implementation of Megatron Core can be found in `NeMo `_. It offers a *simple* and 11 | *intuitive* API. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: User Guide 16 | 17 | user-guide/index 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | :caption: API Guide 22 | 23 | api-guide/index 24 | -------------------------------------------------------------------------------- /docs/source/user-guide/index.rst: -------------------------------------------------------------------------------- 1 | User Guide 2 | ============ 3 | 4 | .. mdinclude:: ../../../megatron/core/QuickStart.md 5 | .. mdinclude:: ../../../megatron/core/MSC_Integration.md -------------------------------------------------------------------------------- /examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge and response generation. More details are available in the [`knowledgeable task directory`](../../tasks/msdp).
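(As a rough guide to the scripts here: `data_processing.sh` prepares the data, `prompt_knwl_gen.sh` and `prompt_resp_gen.sh` run the knowledge- and response-generation prompting stages, `prep_resp_gen.sh` builds the input for the response stage, and the `eval_*.sh` scripts evaluate the generations.)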
5 | 6 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 
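# Note: INTERLEAVED=YES additionally passes --num-layers-per-virtual-pipeline-stage 2, splitting each pipeline stage into 2-layer virtual stages to reduce the pipeline bubble.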
11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 
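# DP is derived below as 64/TP, so TP x DP always covers the same 64 GPUs (8 nodes x 8 GPUs per node).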
11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 
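# Both settings below use the interleaved schedule; SCATTER_GATHER=NO additionally passes --no-scatter-gather-tensors-in-pipeline to disable the optimization.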
18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/export/README.md: -------------------------------------------------------------------------------- 1 | # Megatron Core Export 2 | 3 | This module is used to export Megatron Core models to different inference frameworks. 4 | Currently we support TRTLLM export; in the future we will add support for vLLM and other frameworks. 5 | 6 | ## PTQ AND EXPORT 7 | Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post-training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment. 8 | 9 | ## TRTLLM EXPORT 10 | Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone. 11 | -------------------------------------------------------------------------------- /examples/inference/llama_mistral/huggingface_reference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 3 | 4 | # Set up argument parsing 5 | parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.") 6 | parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation") 7 | parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint") 8 | 9 | # Parse command-line arguments 10 | args = parser.parse_args() 11 | 12 | model_path = args.model_path 13 | prompt = args.prompt 14 | 15 | config = AutoConfig.from_pretrained(model_path) 16 | tokenizer = AutoTokenizer.from_pretrained(model_path, config=config) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda() 18 | 19 | inputs = tokenizer(prompt, return_tensors="pt") 20 | for key in inputs: 21 | inputs[key] = inputs[key].cuda() 22 | # top_k, top_p and do_sample are set for greedy, argmax-based sampling 23 | 24 | outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) 25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model.
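# Set CHECKPOINT, VOCAB_FILE and MERGE_FILE below before launching the server.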
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --seed 42 32 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --vocab-file $VOCAB_FILE \ 28 | --merge-file $MERGE_FILE \ 29 | --seed 42 30 | -------------------------------------------------------------------------------- /examples/mamba/.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 2 | data-cache/ 3 | tensorboard/ 4 | triton-cache/ 5 | -------------------------------------------------------------------------------- /examples/multimodal/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.02-py3 2 | 3 | RUN apt update && \ 4 | apt -y upgrade && \ 5 | apt install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | python3-pip \ 9 | python3-dev \ 10 | bash \ 11 | git \ 12 | vim \ 13 | tmux \ 14 | python-is-python3 \ 15 | default-jre 16 | 17 | RUN pip install --upgrade pip 18 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging 19 | RUN pip install transformers datasets accelerate timm 20 | RUN pip install pytest-cov pytest_mock nltk wrapt 21 | RUN pip install zarr "tensorstore==0.1.45" 22 | RUN pip install black isort click==8.0.2 23 | RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken 24 | RUN pip install git+https://github.com/openai/CLIP.git 25 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies. 
26 | RUN pip install open_clip_torch open-flamingo[eval] --no-deps 27 | -------------------------------------------------------------------------------- /examples/multimodal/assets/pretrain_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/examples/multimodal/assets/pretrain_curves.png -------------------------------------------------------------------------------- /examples/multimodal/convert_llava_pretrain_to_wds.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import webdataset as wds 4 | 5 | from tqdm import tqdm 6 | 7 | llava_pretrain_dir = '' 8 | 9 | # Paths to the dataset files 10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json') 11 | output = os.path.join(llava_pretrain_dir, 'wds') 12 | 13 | if not os.path.exists(output): 14 | os.mkdir(output) 15 | 16 | # Load data 17 | with open(json_file, 'r') as f: 18 | data = json.load(f) 19 | 20 | with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: 21 | for entry in tqdm(data): 22 | with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: 23 | image_data = img_file.read() 24 | sample = { 25 | "__key__": entry['id'], 26 | "jpg": image_data, 27 | "json": json.dumps(entry['conversations']).encode("utf-8"), 28 | } 29 | shard_writer.write(sample) 30 | 31 | print("Dataset successfully converted to wds") 32 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/pretrain_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1. 7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | - weight: 0.01 17 | path: 18 | subflavors: 19 | augmentation: False 20 | 21 | # Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets. 22 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 23 | val: 24 | datasets: 25 | - weight: 1. 26 | path: 27 | subflavors: 28 | augmentation: False 29 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/sft_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1. 7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | # Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets. 17 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 18 | val: 19 | datasets: 20 | - weight: 1.
21 | path: 22 | subflavors: 23 | augmentation: False 24 | -------------------------------------------------------------------------------- /examples/multimodal/pretrain_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/multimodal/sft_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/meta-llama/Llama-3.1-8B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=meta-llama/Llama-3.1-8B-Instruct 5 | TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 1.0 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization RMSNorm \ 22 | --swiglu \ 23 | --num-layers 32 \ 24 | --hidden-size 4096 \ 25 | --ffn-hidden-size 14336 \ 26 | --num-attention-heads 32 \ 27 | --group-query-attention \ 28 | --num-query-groups 8 \ 29 | --seq-length 4096 \ 30 | --max-position-embeddings 8192 \ 31 | --tokenizer-type HuggingFaceTokenizer \ 32 | --make-vocab-size-divisible-by 1 \ 33 | --use-mcore-models \ 34 | --rotary-base 500000 \ 35 | --use-rope-scaling \ 36 | " 37 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/meta-llama/Llama-3.2-1B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=meta-llama/Llama-3.2-1B-Instruct 5 | TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --use-rotary-position-embeddings \ 17 | --no-rope-fusion \ 18 | --no-position-embedding \ 19 | --normalization RMSNorm \ 20 | --swiglu \ 21 | --num-layers 16 \ 22 | --hidden-size 2048 \ 23 | --ffn-hidden-size 8192 \ 24 | --num-attention-heads 32 \ 25 | --group-query-attention \ 26 | --num-query-groups 8 \ 27 | --seq-length 4096 \ 28 | --max-position-embeddings 8192 \ 29 | --tokenizer-type HuggingFaceTokenizer \ 30 | --make-vocab-size-divisible-by 1 \ 31 | --use-mcore-models \ 32 | --rotary-percent 1.0 \ 33 | --rotary-base 500000 \ 34 | --use-rope-scaling \ 35 | --export-force-local-attention \ 36 | " 37 | -------------------------------------------------------------------------------- 
/examples/post_training/modelopt/conf/nvidia/Nemotron-H-4B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=nvidia/Nemotron-H-4B-Instruct 5 | TOKENIZER_MODEL=nvidia/Nemotron-H-4B-Instruct 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 0.5 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization RMSNorm \ 22 | --squared-relu \ 23 | --num-layers 52 \ 24 | --hidden-size 3072 \ 25 | --ffn-hidden-size 12288 \ 26 | --kv-channels 128 \ 27 | --num-attention-heads 32 \ 28 | --group-query-attention \ 29 | --num-query-groups 8 \ 30 | --hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \ 31 | --mamba-head-dim 64 \ 32 | --mamba-num-heads 112 \ 33 | --mamba-num-groups 8 \ 34 | --mamba-state-dim 128 \ 35 | --seq-length 4096 \ 36 | --max-position-embeddings 8192 \ 37 | --tokenizer-type HuggingFaceTokenizer \ 38 | --make-vocab-size-divisible-by 1 \ 39 | --use-mcore-models \ 40 | --rotary-base 10000 \ 41 | --export-model-type MambaModel \ 42 | " 43 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/conf/nvidia/Nemotron-Mini-4B-Instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z ${HF_MODEL_CKPT} ]; then 4 | HF_MODEL_CKPT=nvidia/Nemotron-Mini-4B-Instruct 5 | TOKENIZER_MODEL=nvidia/Nemotron-Mini-4B-Instruct 6 | else 7 | TOKENIZER_MODEL=${HF_MODEL_CKPT} 8 | fi 9 | 10 | MODEL_ARGS=" \ 11 | --save-interval 100000 \ 12 | --micro-batch-size 1 \ 13 | --bf16 \ 14 | --no-masked-softmax-fusion \ 15 | --disable-bias-linear \ 16 | --untie-embeddings-and-output-weights \ 17 | --use-rotary-position-embeddings \ 18 | --rotary-percent 0.5 \ 19 | --no-rope-fusion \ 20 | --no-position-embedding \ 21 | --normalization LayerNorm \ 22 | --apply-layernorm-1p \ 23 | --squared-relu \ 24 | --num-layers 32 \ 25 | --hidden-size 3072 \ 26 | --ffn-hidden-size 9216 \ 27 | --num-attention-heads 24 \ 28 | --group-query-attention \ 29 | --num-query-groups 8 \ 30 | --seq-length 4096 \ 31 | --max-position-embeddings 4096 \ 32 | --tokenizer-type HuggingFaceTokenizer \ 33 | --make-vocab-size-divisible-by 1 \ 34 | --use-mcore-models \ 35 | --rotary-base 10000 \ 36 | " 37 | -------------------------------------------------------------------------------- /examples/post_training/modelopt/mmlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 4 | 5 | # Common arguments and base model specific arguments 6 | source "${SCRIPT_DIR}/conf/arguments.sh" 7 | 8 | # Extra arguments of this script 9 | MLM_DEFAULT_ARGS="--finetune --auto-detect-ckpt-format --export-te-mcore-model --sequence-parallel" 10 | 11 | ${LAUNCH_SCRIPT} ${SCRIPT_DIR}/mmlu.py \ 12 | ${MODEL_ARGS} \ 13 | --tensor-model-parallel-size ${TP} \ 14 | --expert-model-parallel-size ${EP} \ 15 | --pipeline-model-parallel-size ${PP} \ 16 | --tokenizer-model ${TOKENIZER_MODEL} \ 17 | --load ${MLM_MODEL_CKPT} \ 18 | ${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS} 19 | 
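# TP/EP/PP, LAUNCH_SCRIPT, MLM_MODEL_CKPT and MLM_EXTRA_ARGS are expected to be supplied by conf/arguments.sh (sourced above) or by the caller's environment.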
-------------------------------------------------------------------------------- /examples/post_training/modelopt/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | jsonlines 3 | mamba-ssm 4 | causal-conv1d 5 | nvidia-modelopt 6 | omegaconf 7 | pulp 8 | tensorstore!=0.1.46,!=0.1.72 9 | torchprofile 10 | transformers 11 | zarr 12 | -------------------------------------------------------------------------------- /examples/t5/t5_mcore_train_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/examples/t5/t5_mcore_train_curve.png -------------------------------------------------------------------------------- /images/model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/model_table.png -------------------------------------------------------------------------------- /images/strong_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/strong_scaling.png -------------------------------------------------------------------------------- /images/weak_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/images/weak_scaling.png -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import megatron.core.tensor_parallel 4 | import megatron.core.utils 5 | from megatron.core import parallel_state 6 | from megatron.core.distributed import DistributedDataParallel 7 | from megatron.core.inference_params import InferenceParams 8 | from megatron.core.model_parallel_config import ModelParallelConfig 9 | from megatron.core.package_info import ( 10 | __contact_emails__, 11 | __contact_names__, 12 | __description__, 13 | __download_url__, 14 | __homepage__, 15 | __keywords__, 16 | __license__, 17 | __package_name__, 18 | __repository_url__, 19 | __shortversion__, 20 | __version__, 21 | ) 22 | from megatron.core.timers import Timers 23 | 24 | # Alias parallel_state as mpu, its legacy name 25 | mpu = parallel_state 26 | 27 | __all__ = [ 28 | "parallel_state", 29 | "tensor_parallel", 30 | "utils", 31 | "DistributedDataParallel", 32 | "InferenceParams", 33 | "ModelParallelConfig", 34 | "Timers", 35 | ] 36 | -------------------------------------------------------------------------------- /megatron/core/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
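# Global flag gating experimental Megatron Core features; disabled by default.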
2 | 3 | ENABLE_EXPERIMENTAL = False 4 | -------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | 4 | LIBNAME = helpers_cpp 5 | LIBEXT = $(shell python3-config --extension-suffix) 6 | 7 | OUT = $(LIBNAME)$(LIBEXT) 8 | SRC = helpers.cpp 9 | 10 | default: $(OUT) 11 | 12 | $(OUT): $(SRC) 13 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 14 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .config import RetroGPTChunkDatasets 4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig 5 | from .query.retro_dataset import get_retro_datasets 6 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - Embedder: Base class for all Bert embedders. 7 | - RetroBertEmbedders: Container class for in-memory and on-disk embedders. 8 | - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. 9 | - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. 10 | - RetroTokenizers: Container class for GPT and Bert tokenizers. 11 | """ 12 | 13 | from .bert_embedders import Embedder, RetroBertEmbedders 14 | from .config import RetroPreprocessingConfig 15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets 16 | from .tokenizers import RetroTokenizers 17 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/gpt_chunk_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for GPT chunk datasets (train, valid, and test).""" 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class RetroGPTChunkDatasets: 10 | """Container dataclass for GPT chunk datasets.""" 11 | 12 | # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. 13 | train: dict = None 14 | valid: dict = None 15 | test: dict = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Container class for GPT and Bert tokenizers.""" 4 | 5 | from dataclasses import dataclass 6 | 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer 8 | 9 | 10 | @dataclass 11 | class RetroTokenizers: 12 | """Container class for GPT and Bert tokenizers.""" 13 | 14 | gpt: MegatronTokenizer = None 15 | bert: MegatronTokenizer = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - build_db: Build a chunk database from a list of indexed datasets. 7 | """ 8 | 9 | from .build import build_db 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Required external libraries for Retro preprocessing.""" 4 | 5 | import importlib 6 | 7 | required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert 8 | 9 | for lib in required_libs: 10 | try: 11 | globals()[lib] = importlib.import_module(lib) 12 | except ImportError as e: 13 | raise Exception( 14 | f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." 15 | ) 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - train_index: Train an index on representative vectors. 7 | - add_to_index: Add vectors to a trained index. 8 | - build_index: Wrapper function that calls above two functions. 9 | """ 10 | 11 | from .build import add_to_index, build_index, train_index 12 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index. 7 | """ 8 | 9 | from .faiss_base import FaissBaseIndex 10 | from .faiss_par_add import FaissParallelAddIndex 11 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for querying the pretraining dataset.""" 4 | 5 | import os 6 | 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset 8 | 9 | 10 | def get_query_dir(project_dir: str) -> str: 11 | """Get root directory of all saved query data. 12 | 13 | Args: 14 | project_dir (str): Retro project dir. 
15 | 16 | Returns: 17 | Path to query sub-directory in Retro project. 18 | """ 19 | return os.path.join(project_dir, "query") 20 | 21 | 22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: 23 | """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 24 | 25 | Args: 26 | project_dir (str): Retro project dir. 27 | key (str): Dataset split key; 'train', 'valid', or 'test'. 28 | dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. 29 | 30 | Returns: 31 | Path to directory containing this dataset's neighbors within Retro project. 32 | """ 33 | return os.path.join( 34 | get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}") 35 | ) 36 | -------------------------------------------------------------------------------- /megatron/core/datasets/utils_s3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.datasets.object_storage_utils import ( # pylint: disable=unused-import 3 | S3_PREFIX, 4 | S3Client, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersistentObject, ShardedObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | remove_sharded_tensors, 11 | save, 12 | ) 13 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | from megatron.core.dist_checkpointing.strategies.common import register_default_common_strategies 5 | 6 | # We load "common" strategies by default to be always available 7 | register_default_common_strategies() 8 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ FS Reader with metadata cached support. """ 4 | 5 | import os 6 | from typing import Union 7 | 8 | from torch.distributed.checkpoint import FileSystemReader, Metadata 9 | 10 | 11 | class CachedMetadataFileSystemReader(FileSystemReader): 12 | """ 13 | Extends FileSystemReader to cache metadata for improved performance. 14 | 15 | Attributes: 16 | _cached_metadata (Metadata or None): Cached metadata from the file system. 17 | """ 18 | 19 | def __init__(self, path: Union[str, os.PathLike]) -> None: 20 | """ 21 | Initialize with file system path. 22 | 23 | Args: 24 | path (Union[str, os.PathLike]): Path to the checkpoint directory or file. 25 | """ 26 | super().__init__(path=path) 27 | self._cached_metadata = None 28 | 29 | def read_metadata(self) -> Metadata: 30 | """ 31 | Read metadata from file system, caching for subsequent calls. 
32 | 33 | Returns: 34 | Metadata: Checkpoint metadata. 35 | """ 36 | if self._cached_metadata is None: 37 | self._cached_metadata = super().read_metadata() 38 | return self._cached_metadata 39 | -------------------------------------------------------------------------------- /megatron/core/distributed/README.md: -------------------------------------------------------------------------------- 1 | ## How to use PyTorch FSDP2? 2 | 3 | Add these flags to enable Torch FSDP2: 4 | 5 | ``` 6 | --use-torch-fsdp2 7 | --no-gradient-accumulation-fusion 8 | --ckpt-format torch_dist 9 | ``` 10 | 11 | Note that `CUDA_MAX_CONNECTIONS=1` should not be set; leaving it unset allows FSDP communication and computation on the primary stream to fully overlap. 12 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from packaging.version import Version 4 | 5 | from .distributed_data_parallel import DistributedDataParallel 6 | from .distributed_data_parallel_config import DistributedDataParallelConfig 7 | from .finalize_model_grads import finalize_model_grads 8 | from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel 9 | from .torch_fully_sharded_data_parallel_config import TorchFullyShardedDataParallelConfig 10 | -------------------------------------------------------------------------------- /megatron/core/distributed/custom_fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fully_sharded_data_parallel import FullyShardedDataParallel 4 | -------------------------------------------------------------------------------- /megatron/core/distributed/torch_fully_sharded_data_parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Union 5 | 6 | from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig 7 | 8 | 9 | @dataclass 10 | class TorchFullyShardedDataParallelConfig(DistributedDataParallelConfig): 11 | """Configuration for TorchFullyShardedDataParallel.""" 12 | 13 | reshard_after_forward: Union[bool, int] = True 14 | """ 15 | Controls the parameter behavior after forward. 16 | 17 | See PyTorch for complete documentation: 18 | https://github.com/pytorch/pytorch/blob/ac8ddf115065106f038865389a07f2d0c9ed5e11/torch/distributed/fsdp/_fully_shard/_fully_shard.py#L97C31-L97C49 # pylint: disable=line-too-long 19 | """ 20 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
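# Core-level enums: `ModelType` distinguishes encoder-or-decoder stacks (e.g., GPT, BERT)
# from encoder-and-decoder models (e.g., T5) plus the Retro variants, and `Fp8Recipe`
# names the supported FP8 scaling recipes.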
2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | """Model type.""" 8 | 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | retro_encoder = 3 12 | retro_decoder = 4 13 | 14 | 15 | class Fp8Recipe(str, enum.Enum): 16 | """FP8 recipe names: delayed, tensorwise, mxfp8, blockwise.""" 17 | 18 | delayed = "delayed" 19 | tensorwise = "tensorwise" 20 | mxfp8 = "mxfp8" 21 | blockwise = "blockwise" 22 | -------------------------------------------------------------------------------- /megatron/core/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/data_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) 6 | -------------------------------------------------------------------------------- /megatron/core/export/export_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | 8 | @dataclass 9 | class ExportConfig: 10 | """Base configuration for Megatron Core Export. 11 | 12 | These parameters control the export settings for TRT-LLM. 13 | """ 14 | 15 | inference_tp_size: int = 1 16 | 17 | inference_pp_size: int = 1 18 | 19 | use_parallel_embedding: bool = False 20 | 21 | use_embedding_sharing: Optional[bool] = None 22 | 23 | def __post_init__(self): 24 | if self.use_embedding_sharing is not None: 25 | with warnings.catch_warnings(): 26 | warnings.simplefilter("always") 27 | warnings.warn( 28 | "use_embedding_sharing is deprecated in ExportConfig, " 29 | "use share_embeddings_and_output_weights in TRTLLMHelper instead", 30 | DeprecationWarning, 31 | stacklevel=3, 32 | ) 33 | -------------------------------------------------------------------------------- /megatron/core/export/model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | ModelType = Enum( 6 | 'ModelType', 7 | ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma", "nemotron_nas"], 8 | ) 9 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/engine_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import tensorrt_llm 4 | 5 | from megatron.core.export.model_type import ModelType 6 | 7 | TRT_MODEL_CONFIG = { 8 | ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, 9 | ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, 10 | ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, 11 | ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, 12 | ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, 13 | ModelType.gemma: tensorrt_llm.models.GemmaConfig, 14 | ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, 15 | ModelType.nemotron_nas: tensorrt_llm.models.nemotron_nas.config.DeciConfig, 16 | } 17 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.export.model_type import ModelType 4 | 5 | TRT_MODEL_TYPE_STRING = { 6 | ModelType.gpt: 'GPTForCausalLM', 7 | ModelType.gptnext: 'GPTForCausalLM', 8 | ModelType.starcoder: 'GPTForCausalLM', 9 | ModelType.mixtral: 'LlamaForCausalLM', 10 | ModelType.llama: 'LlamaForCausalLM', 11 | ModelType.gemma: 'GemmaForCausalLM', 12 | ModelType.falcon: 'FalconForCausalLM', 13 | ModelType.nemotron_nas: 'DeciLMForCausalLM', 14 | } 15 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trtllm_weights_converter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trtllm_weights_converter/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | GATED_ACTIVATION = ["swiglu", "geglu", "fast-swiglu", "fast-geglu"] 4 | 5 | 6 | def is_gated_activation(helper):
 7 | """Check whether the model uses a gated activation.""" 8 | return helper.activation in GATED_ACTIVATION or helper.transformer_config.gated_linear_unit 9 | -------------------------------------------------------------------------------- /megatron/core/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/extensions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/inference/common_inference_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.inference.sampling_params import ( # noqa: F401 # pylint: disable=unused-import 3 | SamplingParams as CommonInferenceParams, 4 | ) 5 | -------------------------------------------------------------------------------- /megatron/core/inference/contexts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | 5 | from .base_context import BaseInferenceContext 6 | from .dynamic_chunk_allocator import ChunkAllocator 7 | from .static_context import StaticInferenceContext 8 | 9 | warnings.warn( 10 | "The following imports from `dynamic_context.py` will be removed " 11 | "from this file in `megatron-core` 0.14. The imports here result in " 12 | "a cyclic import issue that causes rotary embeddings to import " 13 | "from Apex rather than Transformer Engine.", 14 | DeprecationWarning, 15 | ) 16 | from .dynamic_context import ( 17 | ChunkOverflowError, 18 | ContextOverflowError, 19 | DynamicInferenceContext, 20 | RequestOverflowError, 21 | TokenOverflowError, 22 | ) 23 | -------------------------------------------------------------------------------- /megatron/core/inference/contexts/base_context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | 5 | 6 | class BaseInferenceContext(abc.ABC): 7 | """Base class for inference contexts. 8 | 9 | Currently extended by `StaticInferenceContext` and `DynamicInferenceContext`. 10 | Extend this class for any future context types. 11 | """ 12 | 13 | @abc.abstractmethod 14 | def is_static_batching(self) -> bool: 15 | """Return `True` if context uses static batching.""" 16 | pass 17 | 18 | def is_dynamic_batching(self) -> bool: 19 | """Return `True` if context uses dynamic batching.""" 20 | return not self.is_static_batching() 21 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .abstract_engine import AbstractEngine 4 | from .dynamic_engine import DynamicInferenceEngine 5 | from .static_engine import StaticInferenceEngine 6 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/abstract_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | 5 | 6 | class AbstractEngine(ABC): 7 | 8 | @abstractmethod 9 | def generate(self) -> dict: 10 | """The abstract backend's generate function. 11 | 12 | To define a new backend, implement this and return the outputs as a dictionary. 13 | 14 | Returns: 15 | dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`.
16 | """ 17 | pass 18 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/mcore_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .static_engine import ( # noqa: F401 # pylint: disable=unused-import 4 | StaticInferenceEngine as MCoreEngine, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/t5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import 4 | TextGenerationController as SimpleTextGenerationController, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/inference/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | class Counter: 3 | """A simple counter class 4 | 5 | This class is responsible for assigning request ids to incoming requests 6 | """ 7 | 8 | def __init__(self, start: int = 0) -> None: 9 | self.counter = start 10 | 11 | def __next__(self) -> int: 12 | i = self.counter 13 | self.counter += 1 14 | return i 15 | 16 | def reset(self) -> None: 17 | """Reset counter""" 18 | self.counter = 0 19 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .inference.contexts import ( # noqa: F401 # pylint: disable=unused-import 4 | StaticInferenceContext as InferenceParams, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | from megatron.core.utils import is_torch_min_version 6 | 7 | jit_fuser = torch.jit.script 8 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 9 | if is_torch_min_version("2.2.0a0"): 10 | jit_fuser = torch.compile 11 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .t5_model import T5Model 3 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .rope_utils import apply_rotary_pos_emb 4 | from .rotary_pos_embedding import MultimodalRotaryEmbedding, RotaryEmbedding 5 | from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale 6 | -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 
11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .gpt_model import GPTModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from .module import HuggingFaceModule, build_hf_model 3 | -------------------------------------------------------------------------------- /megatron/core/models/huggingface/clip_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from transformers import AutoModel 4 | from transformers.models.siglip.modeling_siglip import SiglipEncoderLayer 5 | 6 | from megatron.core.models.huggingface import HuggingFaceModule 7 | 8 | 9 | class SiglipHuggingFaceModel(HuggingFaceModule): 10 | """ 11 | Wrapper for Siglip HuggingFace models. 12 | """ 13 | 14 | # Currently applies to FSDP2 only, not the custom FSDP implementation. 15 | _fsdp_modules = [SiglipEncoderLayer] 16 | 17 | def __init__(self, config): 18 | super().__init__(config) 19 | self.model = AutoModel.from_pretrained(config.vision_model_type.split("hf://")[1]) 20 | 21 | def forward(self, *args, **kwargs): 22 | """Siglip forward.""" 23 | x = self.model(*args, **kwargs) 24 | x = x["last_hidden_state"] 25 | 26 | return x 27 | -------------------------------------------------------------------------------- /megatron/core/models/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .mamba_model import MambaModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.models.mimo.config.base_configs import MimoModelConfig 4 | from megatron.core.models.mimo.model import MimoModel 5 | from megatron.core.models.mimo.submodules.audio import AudioModalitySubmodules 6 | from megatron.core.models.mimo.submodules.base import ModalitySubmodules 7 | from megatron.core.models.mimo.submodules.vision import VisionModalitySubmodules 8 | 9 | __all__ = [ 10 | 'MimoModelConfig', 11 | 'MimoModel', 12 | # Submodule classes 13 | 'ModalitySubmodules', 14 | 'VisionModalitySubmodules', 15 | 'AudioModalitySubmodules', 16 | ] 17 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from megatron.core.models.mimo.config.base_configs import MimoModelConfig 4 | 5 | __all__ = ['MimoModelConfig'] 6 | -------------------------------------------------------------------------------- /megatron/core/models/mimo/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.models.mimo.model.base import MimoModel 3 | 4 | __all__ = ['MimoModel'] 5 | -------------------------------------------------------------------------------- /megatron/core/models/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - RetroConfig: configuration dataclass for RetroModel. 7 | - RetroModel: The Retro model. 8 | - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 9 | """ 10 | 11 | from .config import RetroConfig 12 | from .decoder_spec import get_retro_decoder_block_spec 13 | from .model import RetroModel 14 | -------------------------------------------------------------------------------- /megatron/core/models/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | import torch 6 | 7 | 8 | def get_config_path(project_dir: str) -> str: 9 | """Config copy stored within retro project dir.""" 10 | return os.path.join(project_dir, "config.json") 11 | 12 | 13 | def get_gpt_data_dir(project_dir: str) -> str: 14 | """Get project-relative directory of GPT bin/idx datasets.""" 15 | return os.path.join(project_dir, "data") 16 | 17 | 18 | # ** Note ** : Retro's compatibility between cross attention and Flash/Fused 19 | # Attention is currently a work in progress. We default to returning None for 20 | # now. 21 | # def get_all_true_mask(size, device): 22 | # return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) 23 | def get_all_true_mask(size, device): 24 | """Return None for now; see the compatibility note above.""" 25 | return None 26 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/optimizer/cpu_offloading/README.md: -------------------------------------------------------------------------------- 1 | ## How to use? 2 | 3 | Add these flags to enable optimizer CPU offload in MCore. 4 | 5 | ```bash 6 | --optimizer-cpu-offload 7 | --optimizer-offload-fraction 1.0 8 | --use-precision-aware-optimizer 9 | ``` 10 | 11 | ## Configuration Recommendations 12 | 13 | The gradient copy from GPU to CPU, the CPU optimizer step, and the subsequent parameter copy from CPU to GPU can all be time-consuming; it is recommended to pass the flag `--overlap-cpu-optimizer-d2h-h2d` so that they execute concurrently, as in the sketch below.
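As a concrete illustration, here is a minimal Python sketch that assembles these flags into a training launch command. Only the offload flags themselves are taken from this README; the `torchrun` invocation, the GPU count, and the `pretrain_gpt.py` entry point are placeholders for your own setup.

```python
# Minimal sketch: launch pretraining with optimizer CPU offload enabled.
# Everything other than the offload flags is a placeholder; substitute your
# own entry point and training arguments.
import subprocess

offload_flags = [
    "--optimizer-cpu-offload",
    "--optimizer-offload-fraction", "1.0",
    "--use-precision-aware-optimizer",
    "--overlap-cpu-optimizer-d2h-h2d",  # overlap the D2H/H2D copies with the CPU step
]

cmd = ["torchrun", "--nproc_per_node", "8", "pretrain_gpt.py", *offload_flags]
subprocess.run(cmd, check=True)
```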
14 | -------------------------------------------------------------------------------- /megatron/core/optimizer/cpu_offloading/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | from .hybrid_optimizer import HybridDeviceOptimizer 3 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 13 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from dataclasses import dataclass 3 | 4 | from torch import Tensor 5 | 6 | 7 | @dataclass 8 | class PackedSeqParams: 9 | ''' 10 | Parameters passed to TEDotProductAttention and fused RoPE kernels for the 11 | `thd` (packed) sequence format. 12 | ''' 13 | 14 | qkv_format: str = None 15 | cu_seqlens_q: Tensor = None 16 | cu_seqlens_kv: Tensor = None 17 | cu_seqlens_q_padded: Tensor = None 18 | cu_seqlens_kv_padded: Tensor = None 19 | max_seqlen_q: Tensor = None 20 | max_seqlen_kv: Tensor = None 21 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .schedules import get_forward_backward_func 3 | -------------------------------------------------------------------------------- /megatron/core/post_training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Integrations with NVIDIA TensorRT Model Optimizer (referred to as ModelOpt).
3 | 4 | ModelOpt is a library comprising state-of-the-art model optimization techniques 5 | including quantization and sparsity to compress models for efficient inference on 6 | NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless 7 | experience for users to optimize their Megatron-core models for inference. 8 | More details on ModelOpt, including installation and usage, can be found at 9 | https://github.com/NVIDIA/TensorRT-Model-Optimizer. 10 | """ 11 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/post_training/modelopt/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | packaging 3 | -------------------------------------------------------------------------------- /megatron/core/ssm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/ssm/mlp_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Optional 4 | 5 | from megatron.core.process_groups_config import ModelCommProcessGroups 6 | from megatron.core.transformer import ( 7 | TransformerConfig, 8 | TransformerLayer, 9 | TransformerLayerSubmodules, 10 | ) 11 | 12 | 13 | class MLPLayer(TransformerLayer): 14 | """Drop-in replacement for TransformerLayer but initializes only an MLP via the spec.""" 15 | 16 | def __init__( 17 | self, 18 | config: TransformerConfig, 19 | submodules: TransformerLayerSubmodules, 20 | layer_number: int = 1, 21 | hidden_dropout: Optional[float] = None, 22 | model_comm_pgs: Optional[ModelCommProcessGroups] = None, 23 | ): 24 | super().__init__( 25 | config=config, 26 | submodules=submodules, 27 | layer_number=layer_number, 28 | hidden_dropout=hidden_dropout, 29 | model_comm_pgs=model_comm_pgs, 30 | ) 31 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
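# Convenience re-exports of the core transformer building blocks: the MegatronModule base
# class, module-spec utilities (ModuleSpec, build_module), the transformer configs, and
# TransformerLayer with its submodule container.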
2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import MLATransformerConfig, TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/transformer_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | 5 | warnings.warn( 6 | """The 'megatron.core.transformer.custom_layers.transformer_engine' 7 | module is deprecated and will be removed in 0.10.0. Please use 8 | 'megatron.core.extensions.transformer_engine' instead.""", 9 | DeprecationWarning, 10 | stacklevel=2, 11 | ) 12 | from megatron.core.extensions.transformer_engine import * 13 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | """Model Type 10 | 11 | encoder_or_decoder for BERT, GPT, etc. 12 | encoder_and_decoder for multimodal models, T5, etc. 13 | """ 14 | 15 | encoder_or_decoder = 1 16 | encoder_and_decoder = 2 17 | 18 | 19 | # class LayerType(enum.Enum): 20 | # encoder = 1 21 | # decoder = 2 22 | 23 | 24 | class AttnType(enum.Enum): 25 | """Attention type""" 26 | 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | 31 | class AttnMaskType(enum.Enum): 32 | """Attention Mask Type""" 33 | 34 | padding = 1 35 | causal = 2 36 | no_mask = 3 # only used for TE 37 | padding_causal = 4 # only used for thd attention 38 | arbitrary = 5 39 | 40 | 41 | class AttnBackend(enum.Enum): 42 | """Attention Backend""" 43 | 44 | flash = 1 45 | fused = 2 46 | unfused = 3 47 | local = 4 48 | auto = 5 49 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
20 | Such a func is handy for ops like `bias_dropout_fusion` which themselves 21 | return a function at runtime based on passed arguments 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | """Check if grouped_gemm is available.""" 11 | return grouped_gemm is not None 12 | 13 | 14 | def assert_grouped_gemm_is_available(): 15 | """Assert that grouped_gemm is available.""" 16 | assert grouped_gemm_is_available(), ( 17 | "Grouped GEMM is not available. Please run " 18 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4`." 19 | ) 20 | 21 | 22 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 23 | -------------------------------------------------------------------------------- /megatron/core/transformer/torch_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.transformer.torch_norm import WrappedTorchNorm 3 | 4 | WrappedTorchLayerNorm = WrappedTorchNorm 5 | -------------------------------------------------------------------------------- /megatron/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/endpoints/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import threading 5 | 6 | GENERATE_NUM = 0 7 | BEAM_NUM = 1 8 | LOCK = threading.Lock() 9 | 10 | 11 | def send_do_generate(): 12 | choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device="cuda") 13 | torch.distributed.broadcast(choice, 0) 14 | 15 | 16 | def send_do_beam_search(): 17 | choice = torch.tensor([BEAM_NUM], dtype=torch.long, device="cuda") 18 | torch.distributed.broadcast(choice, 0) 19 | -------------------------------------------------------------------------------- /megatron/inference/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/legacy/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /megatron/legacy/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA Apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/legacy/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | -------------------------------------------------------------------------------- /megatron/legacy/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/legacy/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
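# RMSNorm rescales activations by the root-mean-square of the last dimension instead of
# centering and scaling as LayerNorm does:
#     y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight
# The forward pass below computes the norm in float32 and casts back to the input dtype
# before applying the learned weight.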
2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False, 12 | config: dict = None): 13 | """RMS Normalization module 14 | 15 | Args: 16 | dim (int): The width of input, i.e. hidden size 17 | eps (float): epsilon to use for the norm, defaults to 1e-6 18 | sequence_parallel (bool): Set to true if sequence parallelism is being used, 19 | this marks the weights as needing to be allreduced. 20 | config (dict): Unused here; accepted for interface compatibility. 21 | """ 22 | super().__init__() 23 | self.eps = eps 24 | self.weight = nn.Parameter(torch.ones(dim)) 25 | 26 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 27 | 28 | def _norm(self, x): 29 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 30 | 31 | def forward(self, x): 32 | output = self._norm(x.float()).type_as(x) 33 | return output * self.weight 34 | -------------------------------------------------------------------------------- /megatron/legacy/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import warnings 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def resize(input, 8 | size=None, 9 | scale_factor=None, 10 | mode='nearest', 11 | align_corners=None, 12 | warning=True): 13 | if warning: 14 | if size is not None and align_corners: 15 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 16 | output_h, output_w = tuple(int(x) for x in size) 17 | if output_h > input_h or output_w > input_w: 18 | if ((output_h > 1 and output_w > 1 and input_h > 1 19 | and input_w > 1) and (output_h - 1) % (input_h - 1) 20 | and (output_w - 1) % (input_w - 1)): 21 | warnings.warn( 22 | f'When align_corners={align_corners}, ' 23 | 'the output would be more aligned if ' 24 | f'input size {(input_h, input_w)} is `x+1` and ' 25 | f'out size {(output_h, output_w)} is `nx+1`') 26 | if isinstance(size, torch.Size): 27 | size = tuple(int(x) for x in size) 28 | return F.interpolate(input, size, scale_factor, mode, align_corners) 29 | -------------------------------------------------------------------------------- /megatron/legacy/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/megatron/legacy/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/post_training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/post_training/algos/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/post_training/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
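# Helpers used by the post-training flows: per-rank GPU memory reporting and loading the
# MTBench prompts in OpenAI chat-completion format.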
2 | 3 | import torch 4 | from datasets import load_dataset 5 | 6 | 7 | def get_current_memory_info(): 8 | """Get current memory usage.""" 9 | remaining_mem, total_mem = torch.cuda.mem_get_info() 10 | info = "rank {:3}/{:3} memory remaining {:03}% ({}/{} MB) ".format( 11 | torch.distributed.get_rank(), 12 | torch.distributed.get_world_size(), 13 | int(remaining_mem * 100 / total_mem), 14 | remaining_mem // 1048576, 15 | total_mem // 1048576, 16 | ) 17 | return info 18 | 19 | 20 | def report_current_memory_info(): 21 | """Report current memory usage.""" 22 | print(get_current_memory_info(), flush=True) 23 | torch.distributed.barrier() 24 | 25 | 26 | def get_mtbench_chat_data(): 27 | """Return a MTBench dataset.""" 28 | 29 | def mtbench_to_oai_chat(example): 30 | """Convert MTBench data to OpenAI chat completion format.""" 31 | conversations = [] 32 | for prompt in example["prompt"]: 33 | conversations.append({"role": "user", "content": prompt}) 34 | example["conversations"] = conversations 35 | return example 36 | 37 | dataset = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train") 38 | return dataset.map(mtbench_to_oai_chat) 39 | -------------------------------------------------------------------------------- /megatron/training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args 6 | from .global_vars import get_signal_handler 7 | from .global_vars import get_tokenizer 8 | from .global_vars import get_tensorboard_writer 9 | from .global_vars import get_wandb_writer 10 | from .global_vars import get_one_logger 11 | from .global_vars import get_adlr_autoresume 12 | from .global_vars import get_timers 13 | from .initialize import initialize_megatron 14 | from .training import pretrain, get_model, get_train_valid_test_num_samples 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/training/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | 8 | @jit_fuser 9 | def squared_relu(x: torch.Tensor) -> torch.Tensor: 10 | return torch.pow(F.relu(x), 2) 11 | 12 | 13 | @jit_fuser 14 | def quick_gelu(x: torch.Tensor) -> torch.Tensor: 15 | return x * torch.sigmoid(1.702 * x) 16 | 17 | @jit_fuser 18 | def fast_gelu(x: torch.Tensor) -> torch.Tensor: 19 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) 20 | -------------------------------------------------------------------------------- /megatron/training/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 
13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries that come from the blacklisted modules 20 | # through (e.g., PyTorch Distributed). 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/training/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | check_untyped_defs = False 4 | disallow_untyped_calls = False 5 | disallow_untyped_defs = False 6 | disallow_incomplete_defs = False 7 | 8 | disable_error_code = call-arg,operator,var-annotated,union-attr,import-untyped 9 | 10 | # Enable only `assignment` error checking 11 | enable_error_code = assignment -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | internal: mark a test as a test to private/internal functions. 5 | flaky: mark flaky tests for LTS environment 6 | flaky_in_dev: mark flaky tests for DEV environment 7 | -------------------------------------------------------------------------------- /requirements/pytorch_24.01/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest_asyncio 6 | pytest-cov 7 | pytest_mock 8 | pytest-random-order 9 | sentencepiece 10 | tiktoken 11 | wrapt 12 | zarr 13 | wandb 14 | triton==2.1.0 15 | tensorstore!=0.1.46,!=0.1.72 16 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 17 | nvtx 18 | -------------------------------------------------------------------------------- /requirements/pytorch_24.07/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest_asyncio 6 | pytest-cov 7 | pytest_mock 8 | pytest-random-order 9 | sentencepiece 10 | tiktoken 11 | wrapt 12 | zarr 13 | wandb 14 | tensorstore!=0.1.46 15 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 16 | nvidia-resiliency-ext 17 | -------------------------------------------------------------------------------- /requirements/pytorch_24.10/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | zarr 3 | tensorstore!=0.1.46,!=0.1.72 4 | torch 5 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" 6 | nvidia-resiliency-ext; platform_machine == "x86_64" 7 | nvtx 8 | -------------------------------------------------------------------------------- /requirements/pytorch_25.03/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | sentencepiece 9 | tiktoken 10 | wrapt 11 | zarr 12 | wandb 13 | tensorstore!=0.1.46,!=0.1.72 14 | torch 15 | nvidia-modelopt[torch]>=0.23.2; 
sys_platform != "darwin" 16 | nvtx 17 | -------------------------------------------------------------------------------- /requirements_ci.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | wrapt 3 | pytest 4 | pytest_asyncio 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | wandb -------------------------------------------------------------------------------- /requirements_mlm.txt: -------------------------------------------------------------------------------- 1 | tiktoken 2 | flask-restful 3 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron.training import get_args, print_rank_0 6 | from megatron.legacy.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49689, "5": 10.48165, "10": 10.50192, "15": 10.45891, "20": 10.44599, "25": 10.35067, "30": 10.16617, "35": 10.04377, "40": 9.90903, "45": 9.75804, "50": 9.67525}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2071.0, "5": 2603.0, "10": 2120.0, "15": 2502.0, "20": 2235.0, "25": 2509.0, "30": 2938.0, "35": 2948.0, "40": 2197.0, "45": 3921.0, "50": 3479.0}}, "mem-allocated-bytes": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.89689, "5": 1.31101, "10": 1.31458, "15": 1.39008, "20": 1.43723, "25": 1.38294, "30": 1.37996, "35": 1.34031, "40": 1.38199, "45": 1.37809, "50": 1.40054}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.48367, "5": 10.47639, "10": 10.47262, "15": 10.47929, "20": 10.45433, "25": 10.38155, "30": 10.21158, "35": 10.1058, "40": 9.98135, "45": 9.8233, "50": 9.7299}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2570.0, "5": 2068.0, "10": 2597.0, "15": 2038.0, "20": 2750.0, "25": 2493.0, "30": 2850.0, "35": 2434.0, "40": 3418.0, "45": 3632.0, "50": 2132.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1784014336.0, "5": 1784014336.0, "10": 1784014336.0, "15": 1784014336.0, "20": 1784014336.0, "25": 1784014336.0, "30": 1784014336.0, "35": 1784014336.0, "40": 1784014336.0, "45": 1784014336.0, "50": 1784014336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2365860864.0, "5": 3108842496.0, "10": 3108842496.0, "15": 3108842496.0, "20": 3108842496.0, "25": 3108842496.0, "30": 3108842496.0, "35": 3108842496.0, "40": 3108842496.0, "45": 3108842496.0, "50": 3108842496.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.5518, "5": 1.13792, "10": 1.13766, "15": 1.22776, "20": 1.1374, "25": 1.18568, "30": 1.23204, "35": 1.14281, "40": 1.37036, "45": 1.13878, "50": 1.3794}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34954, "30": 10.17283, "35": 10.0427, "40": 9.9076, "45": 9.7577, "50": 9.67688}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2166.0, "25": 2639.0, "30": 2769.0, "35": 3080.0, "40": 2282.0, "45": 3831.0, "50": 3519.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2300849152.0, "5": 3043311616.0, "10": 3043311616.0, "15": 3043311616.0, "20": 3043311616.0, "25": 3043311616.0, "30": 3043311616.0, "35": 3043311616.0, "40": 3043311616.0, "45": 3043311616.0, "50": 3043311616.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.92253, "5": 1.17517, "10": 1.16204, "15": 1.1534, "20": 1.15142, "25": 1.1777, "30": 1.14956, "35": 1.15257, "40": 1.14342, "45": 1.14293, "50": 1.14651}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49226, "5": 10.49833, "10": 10.49375, "15": 10.48886, "20": 10.46612, "25": 10.39219, "30": 10.20812, "35": 10.06926, "40": 9.93854, "45": 9.75472, "50": 9.6868}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2098.0, "5": 2869.0, "10": 2322.0, "15": 2605.0, "20": 2299.0, "25": 2583.0, "30": 2637.0, "35": 3051.0, "40": 1841.0, "45": 3921.0, "50": 3392.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3375511040.0, "5": 3375511040.0, "10": 3375511040.0, "15": 3375511040.0, "20": 3375511040.0, "25": 3375511040.0, "30": 3375511040.0, "35": 3375511040.0, "40": 3375511040.0, "45": 3375511040.0, "50": 3375511040.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4165166080.0, "5": 5631605760.0, "10": 5631605760.0, "15": 5631605760.0, "20": 5631605760.0, "25": 5631605760.0, "30": 5631605760.0, "35": 5631605760.0, "40": 5631605760.0, "45": 5631605760.0, "50": 5631605760.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.75804, "5": 0.68618, "10": 0.68574, "15": 0.71793, "20": 0.79578, "25": 0.68652, "30": 0.69897, "35": 0.68192, "40": 0.69111, "45": 0.68688, "50": 0.79338}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.47723, "5": 10.4715, "10": 10.46311, "15": 10.48841, "20": 10.44522, "25": 10.35474, "30": 10.2301, "35": 10.08868, "40": 9.93794, "45": 9.80332, "50": 9.70238}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2137.0, "5": 2003.0, "10": 2053.0, "15": 1807.0, "20": 2617.0, "25": 2429.0, "30": 2748.0, "35": 2364.0, "40": 3423.0, "45": 3125.0, "50": 2396.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3404871168.0, "5": 3404871168.0, "10": 3404871168.0, "15": 3404871168.0, "20": 3404871168.0, "25": 3404871168.0, "30": 3404871168.0, "35": 3404871168.0, "40": 3404871168.0, "45": 3404871168.0, "50": 3404871168.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4194526208.0, "5": 5660965376.0, "10": 5660965376.0, "15": 5660965376.0, "20": 5660965376.0, "25": 5660965376.0, "30": 5660965376.0, "35": 5660965376.0, "40": 5660965376.0, "45": 5660965376.0, "50": 5660965376.0}}, 
"iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.23346, "5": 0.60029, "10": 0.63129, "15": 0.587, "20": 0.60414, "25": 0.59205, "30": 0.66378, "35": 0.64433, "40": 0.65072, "45": 0.64763, "50": 0.63206}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49405, "5": 10.49933, "10": 10.49631, "15": 10.4873, "20": 10.46572, "25": 10.39496, "30": 10.2104, "35": 10.07333, "40": 9.94011, "45": 9.75651, "50": 9.69025}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2018.0, "5": 2740.0, "10": 2260.0, "15": 2649.0, "20": 2205.0, "25": 2675.0, "30": 2687.0, "35": 2930.0, "40": 1853.0, "45": 4016.0, "50": 2978.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3375511040.0, "5": 3375511040.0, "10": 3375511040.0, "15": 3375511040.0, "20": 3375511040.0, "25": 3375511040.0, "30": 3375511040.0, "35": 3375511040.0, "40": 3375511040.0, "45": 3375511040.0, "50": 3375511040.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4153629696.0, "5": 5620069376.0, "10": 5620069376.0, "15": 5620069376.0, "20": 5620069376.0, "25": 5620069376.0, "30": 5620069376.0, "35": 5620069376.0, "40": 5620069376.0, "45": 5620069376.0, "50": 5620069376.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.9519, "5": 0.61548, "10": 0.60778, "15": 0.60342, "20": 0.59844, "25": 0.60331, "30": 0.60426, "35": 0.59982, "40": 0.59928, "45": 0.80076, "50": 0.64239}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.46352, "5": 10.45321, "10": 10.4481, "15": 10.45891, "20": 10.41677, "25": 10.34598, "30": 10.1814, "35": 10.03992, "40": 9.90206, "45": 9.74954, "50": 9.66818}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2485.0, "5": 2824.0, "10": 2427.0, "15": 2767.0, "20": 2412.0, "25": 2691.0, "30": 2807.0, "35": 3077.0, "40": 2363.0, "45": 3744.0, "50": 3526.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4360259072.0, "5": 5220507136.0, "10": 5220507136.0, "15": 5220507136.0, "20": 5220507136.0, "25": 5220507136.0, "30": 5220507136.0, "35": 5220507136.0, "40": 5220507136.0, "45": 5220507136.0, "50": 5220507136.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.15656, "5": 0.90105, "10": 0.87495, "15": 0.87775, "20": 0.99829, "25": 0.90462, "30": 0.89264, "35": 0.90859, "40": 1.22654, "45": 0.98086, "50": 0.99661}}} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4681, "5": 10.45367, "10": 10.45093, "15": 10.45833, "20": 10.42029, "25": 10.3405, "30": 10.18378, "35": 10.03886, "40": 9.89837, "45": 9.75107, "50": 9.67018}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2373.0, "5": 2811.0, "10": 2502.0, "15": 2556.0, "20": 2392.0, "25": 2764.0, "30": 2957.0, "35": 3046.0, "40": 2373.0, "45": 3854.0, "50": 3568.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4341384704.0, "5": 5201632768.0, "10": 5201632768.0, "15": 5201632768.0, "20": 5201632768.0, "25": 5201632768.0, "30": 5201632768.0, "35": 5201632768.0, "40": 5201632768.0, "45": 5201632768.0, "50": 5201632768.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22.0069, "5": 0.82183, "10": 1.0945, "15": 0.82371, "20": 0.84695, "25": 1.04803, "30": 0.79308, "35": 0.77873, "40": 0.98672, "45": 0.84816, "50": 0.7713}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42004, "5": 10.44687, "10": 10.44032, "15": 10.43081, "20": 10.40841, "25": 10.32605, "30": 10.18604, "35": 10.03131, "40": 9.91274, "45": 9.75116, "50": 9.66124}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3226.0, "5": 3843.0, "10": 2475.0, "15": 2700.0, "20": 3443.0, "25": 2788.0, "30": 2821.0, "35": 4077.0, "40": 3244.0, "45": 4769.0, "50": 3733.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2477577728.0, "5": 3175497216.0, "10": 3175497216.0, "15": 3175497216.0, "20": 3178637312.0, "25": 3178637312.0, "30": 3178637312.0, "35": 3178637312.0, "40": 3178637312.0, "45": 3178637312.0, "50": 3178637312.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.09431, "5": 2.352, "10": 2.3669, "15": 2.36187, "20": 2.34867, "25": 2.34813, "30": 2.35284, "35": 2.36644, "40": 2.35505, "45": 2.34778, "50": 2.35217}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42626, "5": 10.42178, "10": 10.40882, "15": 10.40955, "20": 10.40433, "25": 10.31113, "30": 10.1472, "35": 10.04626, "40": 9.91097, "45": 9.74281, "50": 9.65795}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3452.0, "5": 3418.0, "10": 3298.0, "15": 3261.0, "20": 3448.0, "25": 2542.0, "30": 4164.0, "35": 3701.0, "40": 3387.0, "45": 4965.0, "50": 3268.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1661765632.0, "5": 1661765632.0, "10": 1661765632.0, "15": 1661765632.0, "20": 1661765632.0, "25": 1661765632.0, "30": 1661765632.0, "35": 1661765632.0, "40": 1661765632.0, "45": 1661765632.0, "50": 1661765632.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2510669824.0, "5": 3207546368.0, "10": 3207546368.0, "15": 3209637888.0, "20": 3209637888.0, "25": 3209637888.0, "30": 3209637888.0, "35": 3209637888.0, "40": 3209637888.0, "45": 3209637888.0, "50": 3209637888.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.05902, "5": 2.10578, "10": 2.07255, "15": 2.28342, "20": 2.04777, "25": 2.03295, "30": 2.0347, "35": 2.05296, "40": 2.03634, "45": 2.02561, "50": 2.04166}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.42107, "5": 10.44497, "10": 10.44241, "15": 10.43152, "20": 10.40907, "25": 10.3264, "30": 10.18328, "35": 10.03461, "40": 9.91258, "45": 9.74932, "50": 9.66168}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2229.0, "5": 2848.0, "10": 2437.0, "15": 3644.0, "20": 3449.0, "25": 3783.0, "30": 2913.0, "35": 4128.0, "40": 2230.0, "45": 4790.0, "50": 4716.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2458703360.0, "5": 3155576320.0, "10": 3155576320.0, "15": 3155576320.0, "20": 3155576320.0, "25": 3155576320.0, "30": 3155576320.0, "35": 3155576320.0, "40": 3155576320.0, "45": 3155576320.0, "50": 3155576320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.46827, "5": 2.04207, "10": 2.0714, "15": 2.06559, "20": 2.04371, "25": 2.04465, "30": 2.0474, "35": 2.21838, "40": 2.04636, "45": 2.05719, "50": 2.04581}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 4 | NCCL_ALGO: Tree 5 | CUBLAS_WORKSPACE_CONFIG: :4096:8 6 | MODEL_ARGS: 7 | TEST_TYPE: regular 8 | -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt-nemo/bert-nemo_340m_mr_mbs2_gbs32_mcore_te_tp2_pp2_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 2 11 | trainer.strategy.pipeline_model_parallel_size: 2 12 | trainer.strategy.sequence_parallel: True 13 | data.micro_batch_size: 2 14 | data.global_batch_size: 32 15 | data.seq_length: 512 16 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 17 | TEST_TYPE: regular 18 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/gemma2-nemo_2b_mr_mbs1_gbs8_mcore_te_tp4_pp1_cp1_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 4 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.context_parallel_size: 1 13 | trainer.strategy.sequence_parallel: True 14 | data.micro_batch_size: 1 15 | data.global_batch_size: 8 16 | data.seq_length: 2048 17 | TEST_TYPE: regular 18 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/llama3-nemo_8b_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp2_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | NVTE_APPLY_QK_LAYER_SCALING: 1 5 | MODEL_ARGS: 6 | trainer.num_nodes: 1 7 | trainer.devices: 8 8 | trainer.max_steps: 50 9 | trainer.val_check_interval: 50 10 | trainer.limit_val_batches: 50 11 | trainer.strategy.tensor_model_parallel_size: 2 12 | trainer.strategy.pipeline_model_parallel_size: 2 13 | trainer.strategy.expert_model_parallel_size: 2 14 | trainer.strategy.context_parallel_size: 1 15 | trainer.strategy.sequence_parallel: True 16 | model.config.num_layers: 12 17 | model.config.hidden_size: 768 18 | model.config.num_attention_heads: 16 19 | model.config.ffn_hidden_size: 3072 20 | model.config.apply_query_key_layer_scaling: True 21 | model.config.bias_activation_fusion: False 22 | model.config.add_bias_linear: False 23 | model.config.num_moe_experts: 8 24 | model.config.moe_grouped_gemm: True 25 | model.config.moe_router_load_balancing_type: aux_loss 26 | model.config.moe_router_topk: 2 27 | model.config.moe_aux_loss_coeff: 1e-2 28 | data.micro_batch_size: 1 29 | data.global_batch_size: 8 30 | data.seq_length: 2048 31 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 32 | TEST_TYPE: regular 33 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/llama3-nemo_8b_mr_mbs4_gbs64_mcore_te_tp1_pp1_cp2_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 
| trainer.strategy.tensor_model_parallel_size: 1 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.context_parallel_size: 2 13 | trainer.strategy.sequence_parallel: True 14 | model.config.num_layers: 12 15 | model.config.hidden_size: 768 16 | model.config.num_attention_heads: 16 17 | model.config.ffn_hidden_size: 3072 18 | data.micro_batch_size: 4 19 | data.global_batch_size: 64 20 | data.seq_length: 2048 21 | log.log_dir: ${CHECKPOINT_SAVE_PATH} 22 | TEST_TYPE: regular 23 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/mixtral-nemo_8x7b_mr_mbs1_gbs8_mcore_te_tp2_pp1_ep2_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 2 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | trainer.strategy.expert_model_parallel_size: 4 13 | trainer.strategy.sequence_parallel: True 14 | model.config.num_layers: 12 15 | model.config.hidden_size: 768 16 | model.config.num_attention_heads: 16 17 | model.config.ffn_hidden_size: 3072 18 | data.micro_batch_size: 1 19 | data.global_batch_size: 8 20 | data.seq_length: 2048 21 | TEST_TYPE: regular 22 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/t5-nemo_220m_mr_mbs4_gbs64_te_tp1_pp1_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.strategy.tensor_model_parallel_size: 1 11 | trainer.strategy.pipeline_model_parallel_size: 1 12 | data.micro_batch_size: 4 13 | data.global_batch_size: 64 14 | data.seq_length: 512 15 | TEST_TYPE: regular 16 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86114, "5": 10.87296, "10": 10.83903, "15": 10.8216, "20": 10.71697, "25": 10.5566, "30": 10.36032, "35": 10.26583, "40": 10.08719, "45": 9.82374, "50": 9.90498}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1696.0, "5": 1776.0, "10": 1413.0, "15": 1801.0, "20": 1624.0, "25": 1483.0, "30": 1856.0, "35": 1953.0, "40": 2183.0, "45": 2058.0, "50": 2134.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 948653056.0, "5": 948653056.0, "10": 948653056.0, "15": 948653056.0, "20": 948653056.0, "25": 948653056.0, "30": 948653056.0, "35": 948653056.0, "40": 948653056.0, "45": 948653056.0, "50": 948653056.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3275284480.0, "5": 3632653312.0, "10": 3632653312.0, "15": 3632653312.0, "20": 3632653312.0, "25": 3632653312.0, "30": 3632653312.0, "35": 3632653312.0, "40": 3632653312.0, "45": 
3632653312.0, "50": 3632653312.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.68328, "5": 0.11284, "10": 0.1105, "15": 0.1127, "20": 0.11177, "25": 0.11822, "30": 0.11168, "35": 0.10923, "40": 0.11032, "45": 0.11159, "50": 0.10997}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85678, "5": 10.88398, "10": 10.84079, "15": 10.82504, "20": 10.71912, "25": 10.55479, "30": 10.35998, "35": 10.26937, "40": 10.08396, "45": 9.82563, "50": 9.90725}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1654.0, "5": 1860.0, "10": 1317.0, "15": 1759.0, "20": 1730.0, "25": 1552.0, "30": 1895.0, "35": 1987.0, "40": 2099.0, "45": 1993.0, "50": 2085.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 779997184.0, "5": 779997184.0, "10": 779997184.0, "15": 779997184.0, "20": 779997184.0, "25": 779997184.0, "30": 779997184.0, "35": 779997184.0, "40": 779997184.0, "45": 779997184.0, "50": 779997184.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2463815680.0, "5": 2746575872.0, "10": 2746575872.0, "15": 2746575872.0, "20": 2746575872.0, "25": 2746575872.0, "30": 2746575872.0, "35": 2746575872.0, "40": 2746575872.0, "45": 2746575872.0, "50": 2746575872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.21839, "5": 0.13024, "10": 0.13236, "15": 0.13158, "20": 0.12851, "25": 0.12984, "30": 0.13011, "35": 0.12981, "40": 0.12965, "45": 0.13094, "50": 0.13196}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, 
"values": {"1": 10.86122, "5": 10.88248, "10": 10.83515, "15": 10.82747, "20": 10.72762, "25": 10.55769, "30": 10.37915, "35": 10.28345, "40": 10.08809, "45": 9.82642, "50": 9.91341}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1694.0, "5": 2127.0, "10": 1548.0, "15": 1997.0, "20": 1846.0, "25": 1802.0, "30": 2112.0, "35": 2172.0, "40": 2560.0, "45": 2397.0, "50": 2761.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 382956544.0, "5": 382956544.0, "10": 382956544.0, "15": 382956544.0, "20": 382956544.0, "25": 382956544.0, "30": 382956544.0, "35": 382956544.0, "40": 382956544.0, "45": 382956544.0, "50": 382956544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1497803264.0, "5": 1628741632.0, "10": 1628741632.0, "15": 1628741632.0, "20": 1628741632.0, "25": 1628741632.0, "30": 1628741632.0, "35": 1628741632.0, "40": 1628741632.0, "45": 1628741632.0, "50": 1628741632.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3.92164, "5": 0.29494, "10": 0.2941, "15": 0.29069, "20": 0.2914, "25": 0.29245, "30": 0.29159, "35": 0.29034, "40": 0.29023, "45": 0.29123, "50": 0.29039}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86122, "5": 10.88248, "10": 10.83515, "15": 10.82747, "20": 10.72762, "25": 10.55769, "30": 10.37919, "35": 10.28344, "40": 10.08807, "45": 9.82644, "50": 9.9134}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1694.0, "5": 2127.0, "10": 1548.0, "15": 1997.0, "20": 1846.0, "25": 1700.0, "30": 2165.0, "35": 2194.0, "40": 2540.0, "45": 2414.0, "50": 2586.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 382956544.0, "5": 382956544.0, "10": 382956544.0, "15": 382956544.0, "20": 382956544.0, "25": 382956544.0, "30": 382956544.0, "35": 382956544.0, "40": 382956544.0, "45": 382956544.0, "50": 382956544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1497803776.0, "5": 1629265408.0, "10": 1629265408.0, "15": 1629265408.0, "20": 1629265408.0, "25": 1629265408.0, "30": 1629265408.0, "35": 1629265408.0, "40": 1629265408.0, "45": 1629265408.0, "50": 1629265408.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.03009, "5": 0.32279, "10": 0.32497, "15": 0.32097, "20": 0.31241, "25": 0.30965, "30": 0.31321, "35": 0.30989, "40": 0.3143, "45": 0.31488, "50": 0.31594}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.57354, "5": 12.58052, "10": 12.47389, "15": 11.80615, "20": 11.49679, "25": 10.98441}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 521040608.0, "5": 520996544.0, "10": 521180480.0, "15": 521592480.0, "20": 521134336.0, "25": 523544480.0}}, 
"mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 24510808064.0, "5": 24510808064.0, "10": 24510808064.0, "15": 24510808064.0, "20": 24510808064.0, "25": 24510808064.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 52700401664.0, "5": 60489064448.0, "10": 60489064448.0, "15": 60489064448.0, "20": 60489064448.0, "25": 60489064448.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.84236, "15": "nan", "20": 2.8477, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.58569, "5": 12.5828, "10": 12.48258, "15": 11.79645, "20": 11.47664, "25": 10.97988}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 521035392.0, "5": 520993472.0, "10": 521176928.0, "15": 521588800.0, "20": 521133408.0, "25": 523547232.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 24540168192.0, "5": 24540168192.0, "10": 24540168192.0, "15": 24540168192.0, "20": 24540168192.0, "25": 24540168192.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 52729761792.0, "5": 60518424576.0, "10": 60518424576.0, "15": 60518424576.0, "20": 60518424576.0, "25": 60518424576.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 1.26794, "15": "nan", "20": 1.25096, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.61228, "5": 12.60403, "10": 12.49844, "15": 11.8178, "20": 11.50309, "25": 10.99207}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 523041344.0, "5": 523013024.0, "10": 523188736.0, "15": 523626720.0, "20": 523224480.0, "25": 525635552.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 20634324992.0, "5": 20634324992.0, "10": 20634324992.0, "15": 20634324992.0, "20": 20634324992.0, "25": 20634324992.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 51333926912.0, "5": 58188337152.0, "10": 58188337152.0, "15": 58188337152.0, "20": 58188337152.0, "25": 58188337152.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.59405, "15": "nan", "20": 2.60299, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.59715, "5": 12.59006, "10": 12.49071, "15": 11.82094, "20": 11.51707, "25": 11.00352}}, 
"num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 523037536.0, "5": 523010848.0, "10": 523184768.0, "15": 523629344.0, "20": 523228704.0, "25": 525639232.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 20663961600.0, "5": 20663961600.0, "10": 20663961600.0, "15": 20663961600.0, "20": 20663961600.0, "25": 20663961600.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 50289545216.0, "5": 57144233984.0, "10": 57144233984.0, "15": 57144233984.0, "20": 57144233984.0, "25": 57144233984.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 1.13086, "15": "nan", "20": 1.13253, "25": "nan"}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82975, "5": 10.8439, "10": 10.79337, "15": 10.77994, "20": 10.67712, "25": 10.48584, "30": 10.28468, "35": 10.18859, "40": 9.99279, "45": 9.72153, "50": 9.82127}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 226.0, "5": 275.0, "10": 181.0, "15": 253.0, "20": 248.0, "25": 207.0, "30": 265.0, "35": 281.0, "40": 315.0, "45": 282.0, "50": 336.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 831212544.0, "5": 831212544.0, "10": 831212544.0, "15": 831212544.0, "20": 831212544.0, "25": 831212544.0, "30": 831212544.0, "35": 831212544.0, "40": 831212544.0, "45": 831212544.0, "50": 831212544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 891582464.0, "5": 1250786304.0, "10": 1250786304.0, "15": 1250786304.0, "20": 1250786304.0, "25": 1250786304.0, "30": 1250786304.0, "35": 1251833856.0, "40": 1251833856.0, "45": 1251833856.0, "50": 1251833856.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 20.1181, "5": 0.47795, "10": 0.47291, "15": 0.48167, "20": 0.412, "25": 0.41115, "30": 0.41145, "35": 0.41136, "40": 0.41095, "45": 0.40816, "50": 0.42667}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85004, "5": 10.86413, "10": 10.82533, "15": 10.81501, "20": 10.72113, "25": 10.53088, "30": 10.33843, "35": 10.24208, "40": 10.05219, "45": 9.76638, "50": 9.85497}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1683.0, "5": 1927.0, "10": 1648.0, "15": 2007.0, "20": 1833.0, "25": 1805.0, "30": 2032.0, "35": 2136.0, "40": 2234.0, "45": 2271.0, "50": 2398.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, 
"step_interval": 5, "values": {"1": 886001664.0, "5": 886001664.0, "10": 886001664.0, "15": 886001664.0, "20": 886001664.0, "25": 886001664.0, "30": 886001664.0, "35": 886001664.0, "40": 886001664.0, "45": 886001664.0, "50": 886001664.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212633088.0, "5": 3570001920.0, "10": 3570001920.0, "15": 3570001920.0, "20": 3570001920.0, "25": 3570001920.0, "30": 3570001920.0, "35": 3570001920.0, "40": 3570001920.0, "45": 3570001920.0, "50": 3570001920.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.76404, "5": 0.14426, "10": 0.14503, "15": 0.14512, "20": 0.14395, "25": 0.14807, "30": 0.14833, "35": 0.1429, "40": 0.14205, "45": 0.14208, "50": 0.14172}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8468, "5": 10.8657, "10": 10.82411, "15": 10.8128, "20": 10.72008, "25": 10.53151, "30": 10.33655, "35": 10.24133, "40": 10.05096, "45": 9.76804, "50": 9.85531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1707.0, "5": 2121.0, "10": 1606.0, "15": 1959.0, "20": 1756.0, "25": 1848.0, "30": 2091.0, "35": 2089.0, "40": 2156.0, "45": 2137.0, "50": 2317.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212632576.0, "5": 3572098560.0, "10": 3572098560.0, "15": 3572098560.0, "20": 3572098560.0, "25": 3572098560.0, "30": 3572098560.0, "35": 3572098560.0, "40": 3572098560.0, "45": 3572098560.0, "50": 3572098560.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.77598, "5": 0.14261, "10": 0.14233, "15": 0.14134, "20": 0.14113, "25": 0.141, "30": 0.1403, "35": 0.1406, "40": 0.1401, "45": 0.13985, "50": 0.14004}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79436, "5": 10.84798, "10": 10.7703, "15": 10.78948, "20": 10.68039, "25": 10.506, "30": 10.33228, "35": 10.2547, "40": 10.05593, "45": 9.80637, "50": 9.89113}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1637.0, "5": 1785.0, "10": 1384.0, "15": 1933.0, "20": 1624.0, "25": 1589.0, "30": 1959.0, "35": 1973.0, "40": 2248.0, "45": 2173.0, "50": 2448.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 718931456.0, "5": 718931456.0, "10": 718931456.0, "15": 718931456.0, "20": 718931456.0, "25": 718931456.0, "30": 718931456.0, "35": 718931456.0, "40": 718931456.0, "45": 718931456.0, "50": 718931456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 
50, "step_interval": 5, "values": {"1": 2399714816.0, "5": 2685510144.0, "10": 2685510144.0, "15": 2685510144.0, "20": 2685510144.0, "25": 2685510144.0, "30": 2685510144.0, "35": 2685510144.0, "40": 2685510144.0, "45": 2685510144.0, "50": 2685510144.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3.76573, "5": 0.16293, "10": 0.16166, "15": 0.1618, "20": 0.16139, "25": 0.16605, "30": 0.162, "35": 0.16243, "40": 0.16141, "45": 0.16279, "50": 0.16404}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.73394, "5": 10.79243, "10": 10.70607, "15": 10.76012, "20": 10.68686, "25": 10.54768, "30": 10.45359, "35": 10.38572, "40": 10.24216, "45": 9.98159, "50": 10.06417}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2514.0, "5": 2818.0, "10": 2519.0, "15": 2543.0, "20": 2560.0, "25": 2574.0, "30": 2629.0, "35": 2568.0, "40": 2561.0, "45": 2508.0, "50": 2619.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 717420032.0, "5": 717420032.0, "10": 717420032.0, "15": 717420032.0, "20": 717420032.0, "25": 717420032.0, "30": 717420032.0, "35": 717420032.0, "40": 717420032.0, "45": 717420032.0, "50": 717420032.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2401419776.0, "5": 2684785152.0, "10": 2684785152.0, "15": 2684785152.0, "20": 2684785152.0, "25": 2684785152.0, "30": 2684785152.0, "35": 2684785152.0, "40": 2684785152.0, "45": 2684785152.0, "50": 2684785152.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.92787, "5": 0.17447, "10": 0.17372, "15": 0.17578, "20": 0.17588, "25": 0.17513, "30": 0.1731, "35": 0.1734, "40": 0.17385, "45": 0.17319, "50": 0.17333}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.74049, "5": 10.79201, "10": 10.71088, "15": 10.76031, "20": 10.6891, "25": 10.54338, "30": 10.4542, "35": 10.38324, "40": 10.24296, "45": 9.9834, "50": 10.06865}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2527.0, "5": 2875.0, "10": 2475.0, "15": 2508.0, "20": 2650.0, "25": 2392.0, "30": 2484.0, "35": 2573.0, "40": 2559.0, "45": 2519.0, "50": 2500.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 715322368.0, "5": 715322368.0, "10": 715322368.0, "15": 715322368.0, "20": 715322368.0, "25": 715322368.0, "30": 715322368.0, "35": 715322368.0, "40": 715322368.0, "45": 715322368.0, "50": 715322368.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2402991104.0, "5": 2683341824.0, "10": 2683341824.0, "15": 2683341824.0, "20": 2683341824.0, "25": 2683341824.0, "30": 2683341824.0, "35": 2683341824.0, "40": 2683341824.0, "45": 2683341824.0, "50": 2683341824.0}}, "iteration-time": {"start_step": 1, "end_step": 50, 
"step_interval": 5, "values": {"1": 14.81379, "5": 0.17159, "10": 0.17073, "15": 0.16785, "20": 0.17251, "25": 0.17348, "30": 0.17312, "35": 0.17159, "40": 0.16987, "45": 0.17054, "50": 0.16978}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.85169, "10": 10.78745, "15": 10.79503, "20": 10.69101, "25": 10.52199, "30": 10.34557, "35": 10.25813, "40": 10.06995, "45": 9.80182, "50": 9.8759}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1939.0, "10": 1348.0, "15": 1913.0, "20": 1684.0, "25": 1625.0, "30": 1929.0, "35": 1956.0, "40": 2108.0, "45": 2034.0, "50": 2458.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895616.0, "5": 4120607232.0, "10": 4120607232.0, "15": 4120607232.0, "20": 4120607232.0, "25": 4120607232.0, "30": 4120607232.0, "35": 4120607232.0, "40": 4120607232.0, "45": 4120607232.0, "50": 4120607232.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92138, "5": 0.1642, "10": 0.16403, "15": 0.16127, "20": 0.16115, "25": 0.16151, "30": 0.16082, "35": 0.16141, "40": 0.1612, "45": 0.16203, "50": 0.16105}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85285, "10": 10.78449, "15": 10.79226, "20": 10.69196, "25": 10.52317, "30": 10.34507, "35": 10.25889, "40": 10.07027, "45": 9.80301, "50": 9.87673}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1915.0, "10": 1361.0, "15": 1831.0, "20": 1695.0, "25": 1596.0, "30": 1821.0, "35": 1872.0, "40": 2121.0, "45": 2090.0, "50": 2395.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895104.0, "5": 4122703872.0, "10": 4122703872.0, "15": 4122703872.0, "20": 4122703872.0, "25": 4122703872.0, "30": 4122703872.0, "35": 4122703872.0, "40": 4122703872.0, "45": 4122703872.0, "50": 4122703872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.37934, "5": 0.166, "10": 0.16217, "15": 0.1635, "20": 0.16167, "25": 0.15901, "30": 0.15975, "35": 0.15935, "40": 0.15876, "45": 0.16028, "50": 0.15898}}} -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.8517, "10": 10.78749, "15": 10.79505, "20": 10.69119, "25": 10.52294, "30": 10.34604, "35": 10.26165, "40": 10.072, "45": 9.80976, "50": 9.88336}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1915.0, "10": 1391.0, "15": 1873.0, "20": 1698.0, "25": 1701.0, "30": 1980.0, "35": 1893.0, "40": 2037.0, "45": 1968.0, "50": 2391.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895616.0, "5": 4120607232.0, "10": 4120607232.0, "15": 4120607232.0, "20": 4120607232.0, "25": 4120607232.0, "30": 4120607232.0, "35": 4120607232.0, "40": 4120607232.0, "45": 4120607232.0, "50": 4120607232.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.13825, "5": 0.15932, "10": 0.16236, "15": 0.16058, "20": 0.15952, "25": 0.15943, "30": 0.15981, "35": 0.15842, "40": 0.16085, "45": 0.16001, "50": 0.15982}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84523, "5": 10.87431, "10": 10.82854, "15": 10.8192, "20": 10.72736, "25": 10.55176, "30": 10.3649, "35": 10.27828, "40": 10.09756, "45": 9.84183, "50": 9.91243}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1725.0, "5": 1906.0, "10": 1451.0, "15": 1899.0, "20": 1576.0, "25": 1534.0, "30": 1886.0, "35": 1905.0, "40": 2136.0, "45": 2154.0, "50": 2246.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 763220480.0, "5": 763220480.0, "10": 763220480.0, "15": 763220480.0, "20": 763220480.0, "25": 763220480.0, "30": 763220480.0, "35": 763220480.0, "40": 763220480.0, "45": 763220480.0, "50": 763220480.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3868255744.0, "5": 4152064512.0, "10": 4152064512.0, "15": 4152064512.0, "20": 4152064512.0, "25": 4152064512.0, "30": 4152064512.0, "35": 4152064512.0, "40": 4152064512.0, "45": 4152064512.0, "50": 4152064512.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.86411, "5": 0.13803, "10": 0.13439, "15": 0.1352, "20": 0.14116, "25": 0.13406, "30": 0.13892, "35": 0.13943, "40": 0.14209, "45": 0.14014, "50": 0.14122}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 
10.78455, "15": 10.79229, "20": 10.69211, "25": 10.52412, "30": 10.34552, "35": 10.26239, "40": 10.07241, "45": 9.81101, "50": 9.88422}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1848.0, "20": 1601.0, "25": 1635.0, "30": 1936.0, "35": 1908.0, "40": 2100.0, "45": 2098.0, "50": 2333.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 732811264.0, "5": 732811264.0, "10": 732811264.0, "15": 732811264.0, "20": 732811264.0, "25": 732811264.0, "30": 732811264.0, "35": 732811264.0, "40": 732811264.0, "45": 732811264.0, "50": 732811264.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3838895104.0, "5": 4122703872.0, "10": 4122703872.0, "15": 4122703872.0, "20": 4122703872.0, "25": 4122703872.0, "30": 4122703872.0, "35": 4122703872.0, "40": 4122703872.0, "45": 4122703872.0, "50": 4122703872.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.9121, "5": 0.1731, "10": 0.17256, "15": 0.1722, "20": 0.17555, "25": 0.17245, "30": 0.17067, "35": 0.17091, "40": 0.17274, "45": 0.17151, "50": 0.17108}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.88761, "5": 10.9019, "10": 10.86847, "15": 10.84822, "20": 10.71762, "25": 10.54247, "30": 10.33628, "35": 10.23953, "40": 10.03243, "45": 9.768, "50": 9.8531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 581.0, "5": 672.0, "10": 570.0, "15": 660.0, "20": 642.0, "25": 631.0, "30": 634.0, "35": 765.0, "40": 832.0, "45": 798.0, "50": 829.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 609140224.0, "5": 609140224.0, "10": 609140224.0, "15": 609140224.0, "20": 609140224.0, "25": 609140224.0, "30": 609140224.0, "35": 609140224.0, "40": 609140224.0, "45": 609140224.0, "50": 609140224.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 881272320.0, "5": 1139960320.0, "10": 1139960320.0, "15": 1139960320.0, "20": 1139960320.0, "25": 1139960320.0, "30": 1139960320.0, "35": 1139960320.0, "40": 1139960320.0, "45": 1139960320.0, "50": 1139960320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.59675, "5": 0.37244, "10": 0.37323, "15": 0.37434, "20": 0.37587, "25": 0.37155, "30": 0.36463, "35": 0.361, "40": 0.36207, "45": 0.36168, "50": 0.35807}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.88759, "5": 10.90189, "10": 10.86849, "15": 10.84829, "20": 10.71772, "25": 10.54269, "30": 10.33645, "35": 10.23973, "40": 10.03266, "45": 9.76817, "50": 9.85325}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 584.0, "5": 690.0, "10": 501.0, "15": 618.0, "20": 573.0, "25": 605.0, "30": 
678.0, "35": 702.0, "40": 775.0, "45": 787.0, "50": 830.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 611892224.0, "5": 611892224.0, "10": 611892224.0, "15": 611892224.0, "20": 611892224.0, "25": 611892224.0, "30": 611892224.0, "35": 611892224.0, "40": 611892224.0, "45": 611892224.0, "50": 611892224.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 879199232.0, "5": 1155304960.0, "10": 1155318784.0, "15": 1155318784.0, "20": 1155318784.0, "25": 1155318784.0, "30": 1155318784.0, "35": 1155318784.0, "40": 1155318784.0, "45": 1155318784.0, "50": 1155318784.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.16274, "5": 0.34171, "10": 0.34862, "15": 0.36414, "20": 0.34819, "25": 0.33727, "30": 0.35247, "35": 0.35263, "40": 0.33785, "45": 0.3406, "50": 0.35113}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 675226112.0, "5": 857134592.0, "10": 857134592.0, "15": 857134592.0, "20": 857134592.0, "25": 857134592.0, "30": 857134592.0, "35": 857134592.0, "40": 857134592.0, "45": 857134592.0, "50": 857134592.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.29671, "5": 0.41356, "10": 0.41276, "15": 0.4124, "20": 0.41115, "25": 0.41244, "30": 0.41458, "35": 0.41419, "40": 0.41405, "45": 0.41469, "50": 0.41348}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, 
"40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759897600.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.93298, "5": 0.35524, "10": 0.35279, "15": 0.3474, "20": 0.35066, "25": 0.35914, "30": 0.35208, "35": 0.35087, "40": 0.35252, "45": 0.35496, "50": 0.36129}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274688.0, "5": 857131520.0, "10": 857134592.0, "15": 857134592.0, "20": 857134592.0, "25": 857134592.0, "30": 857134592.0, "35": 857134592.0, "40": 857134592.0, "45": 857134592.0, "50": 857134592.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.08022, "5": 0.41819, "10": 0.41975, "15": 0.42276, "20": 0.41504, "25": 0.41104, "30": 0.41458, "35": 0.41187, "40": 0.41442, "45": 0.41888, "50": 0.41596}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898112.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 
933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.82747, "5": 0.35095, "10": 0.35221, "15": 0.35252, "20": 0.35092, "25": 0.35493, "30": 0.35627, "35": 0.35299, "40": 0.35323, "45": 0.35997, "50": 0.34428}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.9297, "10": 10.90469, "15": 10.87115, "20": 10.74984, "25": 10.53727, "30": 10.32528, "35": 10.22874, "40": 10.01958, "45": 9.75531, "50": 9.84057}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 599.0, "5": 637.0, "10": 567.0, "15": 637.0, "20": 569.0, "25": 577.0, "30": 701.0, "35": 733.0, "40": 813.0, "45": 759.0, "50": 874.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 433619456.0, "5": 433619456.0, "10": 433619456.0, "15": 433619456.0, "20": 433619456.0, "25": 433619456.0, "30": 433619456.0, "35": 433619456.0, "40": 433619456.0, "45": 433619456.0, "50": 433619456.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274176.0, "5": 858183168.0, "10": 858183168.0, "15": 858183168.0, "20": 858183168.0, "25": 858183168.0, "30": 858183168.0, "35": 858183168.0, "40": 858183168.0, "45": 858183168.0, "50": 858183168.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.2965, "5": 0.44042, "10": 0.42384, "15": 0.4236, "20": 0.41899, "25": 0.42049, "30": 0.42347, "35": 0.42232, "40": 0.42267, "45": 0.42233, "50": 0.42382}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87853, "10": 10.82979, "15": 10.82044, "20": 10.7038, "25": 10.49397, "30": 10.30529, "35": 10.20166, "40": 10.01885, "45": 9.74947, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 657.0, "5": 614.0, "10": 533.0, "15": 657.0, "20": 610.0, "25": 624.0, "30": 690.0, "35": 677.0, "40": 774.0, "45": 765.0, "50": 884.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 17.78221, "5": 0.35745, "10": 0.35686, "15": 0.35523, "20": 0.34955, "25": 0.34923, "30": 0.35955, "35": 0.36112, "40": 0.34611, "45": 0.40112, "50": 0.34706}}} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87857, "10": 10.8298, "15": 10.82043, "20": 10.7038, "25": 10.49396, "30": 10.30535, "35": 10.20165, "40": 10.01884, "45": 9.74947, "50": 9.83976}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 632.0, "5": 590.0, "10": 548.0, "15": 633.0, "20": 581.0, "25": 568.0, "30": 662.0, "35": 713.0, "40": 768.0, "45": 808.0, "50": 814.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 933156352.0, "35": 933156352.0, "40": 933156352.0, "45": 933156352.0, "50": 933156352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.52949, "5": 0.34536, "10": 0.34032, "15": 0.34188, "20": 0.34737, "25": 0.35154, "30": 0.34407, "35": 0.35764, "40": 0.34414, "45": 0.34298, "50": 0.34026}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93296, "5": 10.92969, "10": 10.9047, "15": 10.87118, "20": 10.74988, "25": 10.53733, "30": 10.32529, "35": 10.22869, "40": 10.01949, "45": 9.75528, "50": 9.84055}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 603.0, "5": 651.0, "10": 512.0, "15": 655.0, "20": 611.0, "25": 608.0, "30": 647.0, "35": 716.0, "40": 794.0, "45": 842.0, "50": 775.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 431522304.0, "5": 431522304.0, "10": 431522304.0, "15": 431522304.0, "20": 431522304.0, "25": 431522304.0, "30": 431522304.0, "35": 431522304.0, "40": 431522304.0, "45": 431522304.0, "50": 431522304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676274688.0, "5": 861328384.0, "10": 861328896.0, "15": 861328896.0, "20": 861328896.0, "25": 861328896.0, "30": 861328896.0, "35": 861328896.0, "40": 861328896.0, "45": 861328896.0, "50": 861328896.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.53857, "5": 0.41994, "10": 0.42437, "15": 0.4243, "20": 0.41968, "25": 0.41901, "30": 0.41523, "35": 0.41823, "40": 0.4121, "45": 0.4234, "50": 0.41265}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev_dgx_h100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": 
{"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86539, "5": 10.87858, "10": 10.82978, "15": 10.82045, "20": 10.70382, "25": 10.49393, "30": 10.30533, "35": 10.20167, "40": 10.01882, "45": 9.74952, "50": 9.83978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 615.0, "5": 655.0, "10": 536.0, "15": 663.0, "20": 604.0, "25": 625.0, "30": 742.0, "35": 711.0, "40": 744.0, "45": 840.0, "50": 883.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 510689792.0, "5": 510689792.0, "10": 510689792.0, "15": 510689792.0, "20": 510689792.0, "25": 510689792.0, "30": 510689792.0, "35": 510689792.0, "40": 510689792.0, "45": 510689792.0, "50": 510689792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 759898624.0, "5": 933156352.0, "10": 933156352.0, "15": 933156352.0, "20": 933156352.0, "25": 933156352.0, "30": 934204416.0, "35": 934204416.0, "40": 934204416.0, "45": 934204416.0, "50": 934204416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.22018, "5": 0.35813, "10": 0.36158, "15": 0.35614, "20": 0.35813, "25": 0.35947, "30": 0.35907, "35": 0.35505, "40": 0.35725, "45": 0.35408, "50": 0.35552}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.93292, "5": 10.92965, "10": 10.90473, "15": 10.87127, "20": 10.74997, "25": 10.53754, "30": 10.32548, "35": 10.22895, "40": 10.01975, "45": 9.75546, "50": 9.84069}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 585.0, "5": 675.0, "10": 544.0, "15": 619.0, "20": 579.0, "25": 620.0, "30": 678.0, "35": 717.0, "40": 813.0, "45": 746.0, "50": 841.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 432177152.0, "5": 432177152.0, "10": 432177152.0, "15": 432177152.0, "20": 432177152.0, "25": 432177152.0, "30": 432177152.0, "35": 432177152.0, "40": 432177152.0, "45": 432177152.0, "50": 432177152.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 676283904.0, "5": 856228864.0, "10": 857276928.0, "15": 857276928.0, "20": 857276928.0, "25": 857276928.0, "30": 857276928.0, "35": 857276928.0, "40": 857276928.0, "45": 857276928.0, "50": 857276928.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.34002, "5": 0.40276, "10": 0.39665, "15": 0.39344, "20": 0.39157, "25": 0.3871, "30": 0.38802, "35": 0.39196, "40": 0.38964, "45": 0.39313, "50": 0.39241}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92655, "5": 10.92722, "10": 10.9079, "15": 10.88296, "20": 10.77594, "25": 10.59266, "30": 10.39175, "35": 10.29701, "40": 10.09666, "45": 9.8447, "50": 9.90944}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": 
{"1": 1675.0, "5": 2035.0, "10": 1469.0, "15": 1853.0, "20": 1641.0, "25": 1685.0, "30": 1947.0, "35": 1941.0, "40": 2148.0, "45": 2122.0, "50": 2483.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 435191808.0, "5": 435191808.0, "10": 435191808.0, "15": 435191808.0, "20": 435191808.0, "25": 435191808.0, "30": 435191808.0, "35": 435191808.0, "40": 435191808.0, "45": 435191808.0, "50": 435191808.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.35385, "5": 0.17431, "10": 0.16906, "15": 0.16815, "20": 0.17162, "25": 0.17427, "30": 0.16998, "35": 0.172, "40": 0.17758, "45": 0.16824, "50": 0.16924}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92335, "5": 10.92815, "10": 10.9082, "15": 10.8847, "20": 10.77516, "25": 10.59065, "30": 10.39293, "35": 10.29701, "40": 10.09481, "45": 9.84578, "50": 9.90863}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 66.0, "5": 59.0, "10": 57.0, "15": 61.0, "20": 67.0, "25": 62.0, "30": 57.0, "35": 64.0, "40": 68.0, "45": 77.0, "50": 86.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 488145408.0, "5": 488145408.0, "10": 488145408.0, "15": 488145408.0, "20": 488145408.0, "25": 488145408.0, "30": 488145408.0, "35": 488145408.0, "40": 488145408.0, "45": 488145408.0, "50": 488145408.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2158389760.0, "5": 2340560384.0, "10": 2340560384.0, "15": 2340560384.0, "20": 2340560384.0, "25": 2340560384.0, "30": 2340560384.0, "35": 2340560384.0, "40": 2340560384.0, "45": 2340560384.0, "50": 2340560384.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.76555, "5": 0.21558, "10": 0.21605, "15": 0.21624, "20": 0.21751, "25": 0.21773, "30": 0.21692, "35": 0.21584, "40": 0.21624, "45": 0.21663, "50": 0.2151}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92799, "10": 10.90789, "15": 10.88313, "20": 10.77626, "25": 10.59138, "30": 10.39195, "35": 10.29687, "40": 10.0964, "45": 9.84466, "50": 9.90919}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 68.0, "5": 64.0, "10": 61.0, "15": 58.0, "20": 64.0, "25": 58.0, "30": 85.0, "35": 66.0, "40": 85.0, "45": 82.0, "50": 68.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2158389248.0, "5": 2338462720.0, "10": 2338462720.0, "15": 2338462720.0, 
"20": 2338462720.0, "25": 2338462720.0, "30": 2338462720.0, "35": 2338462720.0, "40": 2338462720.0, "45": 2338462720.0, "50": 2338462720.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.46397, "5": 0.23056, "10": 0.22926, "15": 0.22949, "20": 0.23168, "25": 0.23436, "30": 0.2291, "35": 0.22829, "40": 0.22791, "45": 0.22801, "50": 0.20614}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92335, "5": 10.92817, "10": 10.90824, "15": 10.88471, "20": 10.77514, "25": 10.5907, "30": 10.39289, "35": 10.29701, "40": 10.09486, "45": 9.84576, "50": 9.90869}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1834.0, "5": 2149.0, "10": 1616.0, "15": 2030.0, "20": 1833.0, "25": 1728.0, "30": 2045.0, "35": 2164.0, "40": 2364.0, "45": 2242.0, "50": 2670.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1720084992.0, "5": 1900158464.0, "10": 1900158464.0, "15": 1900158464.0, "20": 1900158464.0, "25": 1900158464.0, "30": 1900158464.0, "35": 1900158464.0, "40": 1900158464.0, "45": 1900158464.0, "50": 1900158464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.80378, "5": 0.21906, "10": 0.21975, "15": 0.21443, "20": 0.21546, "25": 0.21553, "30": 0.21592, "35": 0.21551, "40": 0.21537, "45": 0.21378, "50": 0.21373}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92795, "10": 10.90786, "15": 10.88314, "20": 10.77629, "25": 10.59141, "30": 10.39192, "35": 10.29686, "40": 10.0964, "45": 9.84464, "50": 9.90918}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1627.0, "5": 2010.0, "10": 1368.0, "15": 1897.0, "20": 1626.0, "25": 1743.0, "30": 1930.0, "35": 1954.0, "40": 2199.0, "45": 2068.0, "50": 2460.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 488144896.0, "5": 488144896.0, "10": 488144896.0, "15": 488144896.0, "20": 488144896.0, "25": 488144896.0, "30": 488144896.0, "35": 488144896.0, "40": 488144896.0, "45": 488144896.0, "50": 488144896.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1902255104.0, "10": 1902255104.0, "15": 1902255104.0, "20": 1902255104.0, "25": 1902255104.0, "30": 1902255104.0, "35": 1902255104.0, "40": 1902255104.0, "45": 1902255104.0, "50": 1902255104.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.72897, "5": 0.21376, "10": 0.21471, "15": 0.21644, "20": 0.21662, "25": 0.21524, "30": 
0.21202, "35": 0.21278, "40": 0.21187, "45": 0.21266, "50": 0.21239}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81294, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28766}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378378752.0, "5": 378378752.0, "10": 378903040.0, "15": 378903040.0, "20": 560548864.0, "25": 560811008.0, "30": 559500288.0, "35": 560548864.0, "40": 560548864.0, "45": 560548864.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2086473728.0, "25": 2086473728.0, "30": 2086473728.0, "35": 2086473728.0, "40": 2086473728.0, "45": 2086473728.0, "50": 2086473728.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.42624, "5": 0.19771, "10": 0.19787, "15": 0.19869, "20": 0.2134, "25": 0.21429, "30": 0.21327, "35": 0.21407, "40": 0.21288, "45": 0.21339, "50": 0.21186}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2304.0, "30": 2419.0, "35": 1906.0, "40": 2063.0, "45": 2340.0, "50": 2943.0}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81295, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28766}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378903040.0, "5": 378378752.0, "10": 378903040.0, "15": 378378752.0, "20": 560811008.0, "25": 560548864.0, "30": 561073152.0, "35": 562646016.0, "40": 560548864.0, "45": 562646016.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2087784448.0, "25": 2087784448.0, "30": 2087784448.0, "35": 2087784448.0, "40": 2087784448.0, "45": 2087784448.0, "50": 2087784448.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.5872, "5": 0.20393, "10": 0.20412, "15": 0.20193, "20": 0.22109, "25": 0.21826, "30": 0.21476, "35": 0.21348, "40": 0.21255, "45": 0.21142, "50": 0.21064}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2491.0, "30": 2428.0, "35": 1827.0, "40": 2072.0, "45": 2361.0, "50": 2998.0}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev_dgx_a100.json: 
-------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G/golden_values_dev_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81478, "5": 10.8517, "10": 10.78749, "15": 10.79506, "20": 10.69119, "25": 10.52293, "30": 10.34604, "35": 10.26168, "40": 10.07199, "45": 9.8098, "50": 9.88336}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1549.0, "5": 1915.0, "10": 1391.0, "15": 1773.0, "20": 1615.0, "25": 1748.0, "30": 1877.0, "35": 1915.0, "40": 2111.0, "45": 2009.0, "50": 2347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 522846720.0, "5": 522846720.0, "10": 522846720.0, "15": 522846720.0, "20": 522846720.0, "25": 522846720.0, "30": 522846720.0, "35": 522846720.0, "40": 522846720.0, "45": 522846720.0, "50": 522846720.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768846848.0, "5": 3912608256.0, "10": 3912608256.0, "15": 3912608256.0, "20": 3912608256.0, "25": 3912608256.0, "30": 3912608256.0, "35": 3912608256.0, "40": 3912608256.0, "45": 3912608256.0, "50": 3912608256.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.36782, "5": 0.18832, "10": 0.16735, "15": 0.16595, "20": 0.16466, "25": 0.16564, "30": 
0.16594, "35": 0.16362, "40": 0.16524, "45": 0.16382, "50": 0.16329}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp1_pp4_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/hybrid/hybrid_mr_mcore_te_tp2_pp1_dgx_a100_1N8G/golden_values_lts_dgx_a100.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69213, "25": 10.5241, "30": 10.34556, "35": 10.26241, "40": 10.07237, "45": 9.811, "50": 9.88419}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1699.0, "25": 1614.0, "30": 1905.0, "35": 1933.0, "40": 2169.0, "45": 2101.0, "50": 2421.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 523004928.0, "5": 523004928.0, "10": 523004928.0, "15": 523004928.0, "20": 523004928.0, "25": 523004928.0, "30": 523004928.0, "35": 523004928.0, "40": 523004928.0, "45": 523004928.0, "50": 523004928.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3768873984.0, "5": 3912766464.0, "10": 3912766464.0, "15": 3912766464.0, "20": 3912766464.0, "25": 3912766464.0, "30": 3912766464.0, "35": 3912766464.0, "40": 3912766464.0, "45": 3912766464.0, "50": 3912766464.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 18.88705, "5": 0.16956, "10": 0.17448, "15": 0.16853, "20": 0.1715, "25": 0.17071, "30": 0.17343, "35": 0.17213, "40": 0.1719, "45": 0.17357, "50": 0.17228}}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp1_pp1_ep8_etp1_cp_memory_speed/golden_values_dev_dgx_h100.json: 
--------------------------------------------------------------------------------
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.30214, "5": 12.30024, "10": 12.30151, "15": 12.30332, "20": 12.30211, "25": 12.30186}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 328814144.0, "5": 328839744.0, "10": 328755232.0, "15": 328750112.0, "20": 328814144.0, "25": 328757856.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 43403747328.0, "5": 43403747328.0, "10": 43403747328.0, "15": 43403747328.0, "20": 43403747328.0, "25": 43403747328.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 50198523904.0, "5": 60830679040.0, "10": 60851249152.0, "15": 60851249152.0, "20": 60851265536.0, "25": 60851265536.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 57.07556, "5": 4.53974, "10": 4.46565, "15": 4.45791, "20": 4.45123, "25": 4.44671}}}
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_ep8_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/functional_tests/test_cases/moe/gpt3_moe_mr_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer/golden_values_lts_dgx_a100.json
--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/t5/t5_11b_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev_dgx_h100.json:
--------------------------------------------------------------------------------
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 10.74903, "5": 11.07428, "10": 9.25131, "15": 8.7913, "20": 8.16509, "25": 7.78974}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 245867.0, "5": 251639.0, "10": 252463.0, "15": 262053.0, "20": 248279.0, "25": 237341.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40574230528.0, "5": 40574230528.0, "10": 40574230528.0, "15": 40574230528.0, "20": 40574230528.0, "25": 40574230528.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 40574234624.0, "5": 44891381760.0, "10": 44900294656.0, "15": 44902916096.0, "20": 44902916096.0, "25": 44902916096.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 13.83435, "5": 0.45747, "10": 0.45413, "15": 0.45505, "20": 0.45544, "25": 0.45536}}}
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-dev.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-pyt-dev
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-lts.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-pyt-lts
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_lts
--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-nemo.yaml:
--------------------------------------------------------------------------------
type: build
format_version: 1
maintainers: [maanug]
spec:
  name: mcore-nemo
  platforms: [linux/amd64]
  source:
    # The image tag will be added via `jet-tests.yaml`
    # Tags are one of {buildcache, $CI_PIPELINE_ID}
    image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci
--------------------------------------------------------------------------------
/tests/unit_tests/__init__.py:
--------------------------------------------------------------------------------
import torch._dynamo

torch._dynamo.config.suppress_errors = True
--------------------------------------------------------------------------------
/tests/unit_tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/data/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/conftest.py:
--------------------------------------------------------------------------------
from unittest import mock

import pytest

from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy
from megatron.core.msc_utils import MultiStorageClientFeature


def pytest_sessionfinish(session, exitstatus):
    if exitstatus == 5:
        session.exitstatus = 0


@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
    return tmp_path_factory.mktemp("data")


@pytest.fixture(scope='session', autouse=True)
def set_default_dist_ckpt_strategy():
    # Disable MSC for tests
    MultiStorageClientFeature.disable()

    def get_pyt_dist_save_sharded_strategy():
        return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1)

    with mock.patch(
        'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy',
        new=get_pyt_dist_save_sharded_strategy,
    ) as _fixture:
        yield _fixture
--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/dist_checkpointing/models/__init__.py
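The `mock.patch(..., new=...)` idiom in the dist-checkpointing conftest above substitutes a concrete function for the patched attribute instead of the default `MagicMock`, which is how every test in the session is silently forced onto the `torch_dist` save strategy. A self-contained toy (not Megatron code) showing the same mechanism:

```python
from unittest import mock


def default_save_strategy():
    return "fully_parallel"


def forced_strategy():
    return "torch_dist"


if __name__ == "__main__":
    # Patching by dotted name replaces the module attribute for the duration
    # of the context, and `new=` supplies the exact substitute object.
    with mock.patch(f"{__name__}.default_save_strategy", new=forced_strategy):
        assert default_save_strategy() == "torch_dist"
    assert default_save_strategy() == "fully_parallel"  # restored on exit
```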
--------------------------------------------------------------------------------
/tests/unit_tests/export/trtllm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/export/trtllm/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/engines/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/model_inference_wrappers/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)


class TestModelInferenceWrapperConfig:

    def test_inference_config(self):
        inference_config = InferenceWrapperConfig(
            hidden_size=10,
            inference_batch_times_seqlen_threshold=10,
            padded_vocab_size=10,
            params_dtype=torch.float,
            fp32_residual_connection=False,
        )
        inference_config.add_attributes({"abc": 45})
        # The message must reference the attribute actually under test; the
        # original referenced a nonexistent `min_tokens` attribute, which would
        # raise AttributeError if the assertion ever fired.
        assert (
            inference_config.abc == 45
        ), f"attribute not set correctly. it is {inference_config.abc}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_common_inference_params.py:
--------------------------------------------------------------------------------
from megatron.core.inference.sampling_params import SamplingParams


class TestSamplingParams:

    def test_sampling_params(self):
        sampling_params = SamplingParams()
        sampling_params.add_attributes({"min_tokens": 45})
        assert (
            sampling_params.min_tokens == 45
        ), f"min tokens not set correctly. it is {sampling_params.min_tokens}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_flash_decode.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb_with_cos_sin
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding


class TestRotaryEmbeddingWithPrecomputedCosSin:

    def setup_method(self):
        self.batch_size = 3
        self.seq_len = 4
        self.d_rot = 6
        self.rotary_embedding = RotaryEmbedding(kv_channels=4, rotary_percent=1.0)

    def test_output_shapes_match(self):

        # Create input tensors
        t = torch.randn(self.seq_len, self.batch_size, 2, self.d_rot * 2, device="cuda")
        rotary_pos_cos, rotary_pos_sin = self.rotary_embedding.get_cos_sin(self.seq_len)

        # Test using Flash Decoding optimized kernel which requires precomputed cos & sin tensors
        expected_shape = torch.Size(
            [self.seq_len, self.batch_size, self.seq_len // 2, self.seq_len * self.batch_size]
        )
        output_flash_rotary = apply_rotary_pos_emb_with_cos_sin(
            t, rotary_pos_cos, rotary_pos_sin, rotary_interleaved=True
        )

        assert (
            output_flash_rotary.shape == expected_shape
        ), f"Outputs do not match: {output_flash_rotary.shape} != {expected_shape}"
--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_inference_utils.py:
--------------------------------------------------------------------------------
from megatron.core.inference.utils import Counter


class TestInferenceUtils:

    def test_counter(self):
        counter = Counter()
        r = next(counter)
        assert r == 0, f'Counter return value should be 0 but it is {r}'
        assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}'
        counter.reset()
        assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}'
--------------------------------------------------------------------------------
/tests/unit_tests/inference/text_generation_controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/inference/text_generation_controllers/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/models/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/pipeline_parallel/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/post_training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/post_training/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/tensor_parallel/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_cross_entropy.py:
--------------------------------------------------------------------------------
import numpy as np
import torch

from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from tests.unit_tests.test_utilities import Utils


def test_vocab_parallel_cross_entropy():
    Utils.initialize_model_parallel(4, 2)
    # torch.range is deprecated; arange with an explicit float dtype yields the
    # same 8-element tensor [0.0, ..., 7.0].
    vocab_parallel_logits = torch.arange(0, 8, dtype=torch.float).repeat(16, 4).cuda()
    target = torch.arange(0, 32, 2).cuda()
    output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
    expected_output = torch.tensor(
        [
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
            10.2309, 8.2309, 6.2309, 4.2309,
        ]
    ).cuda()
    assert torch.equal(torch.round(expected_output), torch.round(output))
    Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_data.py:
--------------------------------------------------------------------------------
import torch

from megatron.core.tensor_parallel.data import broadcast_data
from tests.unit_tests.test_utilities import Utils


def test_broadcast_data():
    Utils.initialize_model_parallel(2, 4)
    input_data = {
        0: torch.ones((8, 8)).cuda() * 0.0,
        1: torch.ones((8, 8)).cuda() * 1.0,
        2: torch.ones((8, 8)).cuda() * 2.0,
        3: torch.ones((8, 8)).cuda() * 3.0,
        4: torch.ones((8, 8)).cuda() * 4.0,
        5: torch.ones((8, 8)).cuda() * 5.0,
        6: torch.ones((8, 8)).cuda() * 6.0,
        7: torch.ones((8, 8)).cuda() * 7.0,
    }
    dtype = torch.float32
    actual_output = broadcast_data([0, 1], input_data, dtype)
    assert torch.equal(actual_output[0], input_data[0])
    assert torch.equal(actual_output[1], input_data[1])
    Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/test_basic.py:
--------------------------------------------------------------------------------
def test_import():
    import megatron
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/transformer/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Megatron-LM/0a438ed43f347ac01ceadb88e10b21de51efc7f7/tests/unit_tests/transformer/moe/__init__.py
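The tensor-parallel tests above call `Utils.initialize_model_parallel(4, 2)` and `(2, 4)`, so they assume a `torch.distributed` world of 8 GPU ranks rather than a plain single-process pytest run. A sketch of a possible launch command, assuming an 8-GPU node; the exact flags used by the CI may differ:

```bash
# Hypothetical invocation; requires 8 visible GPUs.
torchrun --nproc_per_node=8 -m pytest -x tests/unit_tests/tensor_parallel/
```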
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euox pipefail 3 | 4 | GIT_VERSION=$(git version | awk '{print $3}') 5 | GIT_MAJOR=$(echo $GIT_VERSION | awk -F. '{print $1}') 6 | GIT_MINOR=$(echo $GIT_VERSION | awk -F. '{print $2}') 7 | 8 | if [[ $GIT_MAJOR -eq 2 && $GIT_MINOR -lt 31 ]]; then 9 | echo "Git version must be at least 2.31.0. Found $GIT_VERSION" 10 | exit 1 11 | fi 12 | 13 | SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) 14 | CHECK_ONLY=${CHECK_ONLY:-false} 15 | SKIP_DOCS=${SKIP_DOCS:-false} 16 | 17 | BASE_REF=${BASE_REF:-main} 18 | CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) 19 | ADDITIONAL_ARGS="" 20 | ADDITIONAL_BLACK_ARGS="" 21 | ADDITIONAL_PYLINT_ARGS="" 22 | 23 | if [[ $CHECK_ONLY == true ]]; then 24 | ADDITIONAL_ARGS="--check" 25 | ADDITIONAL_BLACK_ARGS="--diff" 26 | fi 27 | 28 | if [[ $SKIP_DOCS == true ]]; then 29 | ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116" 30 | fi 31 | 32 | if [[ -n "$CHANGED_FILES" ]]; then 33 | black --skip-magic-trailing-comma $ADDITIONAL_ARGS $ADDITIONAL_BLACK_ARGS --verbose $CHANGED_FILES 34 | isort $ADDITIONAL_ARGS $CHANGED_FILES 35 | pylint $ADDITIONAL_PYLINT_ARGS $CHANGED_FILES 36 | mypy --explicit-package-bases --follow-imports=skip $CHANGED_FILES || true 37 | else 38 | echo Changeset is empty, all good. 39 | fi 40 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/copyright.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Files ending with .py should have Copyright notice in the first line. 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 5 | 6 | # Move to the project root 7 | cd $SCRIPT_DIR/.. 8 | find_files_with_missing_copyright() { 9 | find ./megatron/ -type f -name '*.py' | while read path; do 10 | echo -en $path"\t" 11 | head -2 $path | grep -iv 'coding=' | head -1 12 | done \ 13 | | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \ 14 | | grep -iv 'BSD 3-Clause License' \ 15 | | grep -iv 'Copyright.*Microsoft' \ 16 | | grep -iv 'Copyright.*The Open AI Team' \ 17 | | grep -iv 'Copyright.*The Google AI' \ 18 | | grep -iv 'Copyright.*Facebook' | while read line; do 19 | echo $line | cut -d' ' -f1 20 | done 21 | } 22 | 23 | 24 | declare RESULT=($(find_files_with_missing_copyright)) # (..) 
--------------------------------------------------------------------------------
/tools/copyright.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Python files must carry a copyright notice on the first line
# (an optional `coding=` line may precede it).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Move to the project root
cd $SCRIPT_DIR/..

find_files_with_missing_copyright() {
    find ./megatron/ -type f -name '*.py' | while read path; do
        echo -en $path"\t"
        head -2 "$path" | grep -iv 'coding=' | head -1
    done \
        | grep -Eiv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \
        | grep -iv 'BSD 3-Clause License' \
        | grep -iv 'Copyright.*Microsoft' \
        | grep -iv 'Copyright.*The Open AI Team' \
        | grep -iv 'Copyright.*The Google AI' \
        | grep -iv 'Copyright.*Facebook' | while read line; do
            echo $line | cut -d' ' -f1
        done
}


# $(..) inside (..) captures the function's output as a bash array.
declare RESULT=($(find_files_with_missing_copyright))

if [ "${#RESULT[@]}" -gt 0 ]; then
    echo "Error: Found files with missing copyright:"
    for (( i=0; i<"${#RESULT[@]}"; i++ )); do
        echo "path= ${RESULT[$i]}"
    done
    exit 1
else
    echo "Ok: All files start with copyright notice"
fi
--------------------------------------------------------------------------------
/tools/linter.py:
--------------------------------------------------------------------------------
import os
import os.path as osp
import pathlib
import subprocess


def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm"""

    try:
        import autopep8
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # get all python file paths from top level directory
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    all_py_paths = set(
        os.path.join(working_dir, fname)
        for fname in os.listdir(working_dir)
        if fname.endswith(".py")
    )

    # get all python file paths from chosen subdirectories
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(
                set(osp.join(path, fname) for fname in fnames if fname.endswith(".py"))
            )

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass the command as an argument list; a single string would be
        # treated as the program name and fail without shell=True.
        command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
--------------------------------------------------------------------------------
/tools/report_theoretical_memory.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

"""Computes theoretical memory footprint for model training without instantiating
a model and running training iterations on GPU(s)."""

from megatron.training import get_args
from megatron.training.initialize import initialize_megatron
from megatron.training.theoretical_memory_usage import report_theoretical_memory

if __name__ == "__main__":
    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    args = get_args()

    report_theoretical_memory(args, verbose=True)
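
report_theoretical_memory.py above drives Megatron's internal estimator. As an independent back-of-envelope cross-check, the classic mixed-precision Adam accounting (the ZeRO paper's figure, not Megatron's exact model, which also covers activations and parallel layouts) comes to 16 bytes per parameter:

# fp16 weights (2 B) + fp16 grads (2 B) + fp32 master weights (4 B)
# + fp32 Adam momentum (4 B) + fp32 Adam variance (4 B) = 16 B/param.
def rough_model_state_gib(num_params: float) -> float:
    return num_params * 16 / 2**30

print(f"{rough_model_state_gib(175e9):.0f} GiB")  # ~2608 GiB for a 175B-parameter model
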
--------------------------------------------------------------------------------
/tools/retro/cli/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

from .cli import retro
--------------------------------------------------------------------------------
/tools/retro/cli/__main__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import os

from . import retro


if __name__ == "__main__":
    retro.init(os.environ["RETRO_PROJECT_DIR"])
--------------------------------------------------------------------------------
/tools/retro/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM nvcr.io/nvidia/pytorch:23.09-py3

# Combine the apt steps and the pip installs to keep the layer count down.
RUN apt update && apt install -qy htop

RUN pip install -U faiss-gpu transformers google-api-python-client \
    && pip install sentencepiece h5py nltk einops
--------------------------------------------------------------------------------
/tools/retro/sft/README.md:
--------------------------------------------------------------------------------
## Note

The content within this `sft` directory is still under active development and will be updated soon.
--------------------------------------------------------------------------------
/tools/retro/sft/open_inst.sh:
--------------------------------------------------------------------------------
DATA_BLEND="1.0 open_inst"
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

# Usage: python tools/text_generation_cli.py <host:port>
# Interactive client for a running Megatron text generation server.
# (A scripted variant is sketched at the end of this document.)
import ast
import json
import sys

import requests

if __name__ == "__main__":
    url = 'http://' + sys.argv[1] + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        tokens_to_generate = ast.literal_eval(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
/tools/wait_daemon.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# SLURM by default terminates all user processes when the main job process is
# finished. This also immediately terminates inprocess.MonitorProcess and
# prevents it from submitting information to the distributed store and from
# finalizing the iteration by waiting on the termination barrier.
#
# This script waits for all "python" processes launched by the current user to
# finish before terminating the SLURM job.

is_daemon_running() {
    pgrep -u $USER "python" > /dev/null
}

# Intended to be invoked (or sourced and called) at the end of the job script.
wait_daemon() {
    while is_daemon_running; do
        sleep 1
    done
}
--------------------------------------------------------------------------------
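
For scripted runs against the same HTTP endpoint that the interactive client in tools/text_generation_cli.py drives, a minimal non-interactive sketch; the server address, prompt, and token budget below are placeholders, not values from the repository:

import json

import requests

# Placeholder address: point this at a server started via one of the
# run_text_generation_server_* example scripts (host:port is deployment-specific).
URL = "http://localhost:5000/api"
HEADERS = {"Content-Type": "application/json"}

# Same request shape as the interactive CLI: a PUT with prompts and a token budget.
payload = {"prompts": ["The quick brown fox"], "tokens_to_generate": 32}
response = requests.put(URL, data=json.dumps(payload), headers=HEADERS)
response.raise_for_status()
print(response.json()["text"][0])
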