├── .coveragerc
├── .flake8
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug.md
    │   ├── enhancement.md
    │   ├── question.md
    │   └── regression.md
    └── workflows
    │   └── stale.yml
├── .gitignore
├── .gitlab-ci.yml
├── .gitlab
    ├── labeler-config.yml
    └── stages
    │   ├── 00.pre.yml
    │   ├── 01.test.yml
    │   ├── 02.functional-tests.yml
    │   └── 03.publish.yml
├── .pylintrc
├── CHANGELOG.md
├── CODEOWNERS
├── CONTRIBUTING.md
├── Dockerfile.ci.dev
├── Dockerfile.ci.lts
├── Dockerfile.linting
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_MEGATRON.md
├── data_preprocessing.sh
├── docs
    ├── llama_mistral.md
    └── source
    │   ├── api-guide
    │       ├── context_parallel.rst
    │       ├── datasets.rst
    │       ├── dist_checkpointing.rst
    │       ├── dist_checkpointing.strategies.rst
    │       ├── dist_optimizer.md
    │       ├── distributed.rst
    │       ├── encoder_decoder_parallelism.rst
    │       ├── fusions.rst
    │       ├── index.rst
    │       ├── models.bert.rst
    │       ├── models.gpt.rst
    │       ├── models.rst
    │       ├── models.t5.rst
    │       ├── moe.rst
    │       ├── num_microbatches_calculator.rst
    │       ├── optimizer_param_scheduler.rst
    │       ├── pipeline_parallel.rst
    │       ├── tensor_parallel.rst
    │       └── transformer.rst
    │   ├── images
    │       ├── context_parallel
    │       │   ├── CP_overview.png
    │       │   └── CP_results.png
    │       ├── distrib_optimizer
    │       │   ├── data_flow.png
    │       │   └── sharding_scheme.png
    │       └── moe
    │       │   └── token_drop.png
    │   ├── index.rst
    │   └── user-guide
    │       └── index.rst
├── examples
    ├── academic_paper_scripts
    │   ├── detxoify_lm
    │   │   ├── README.md
    │   │   ├── annotations
    │   │   │   ├── filter-selfgeneration.py
    │   │   │   ├── perspective_api_annotate.py
    │   │   │   └── preprocess.sh
    │   │   ├── finetune_gpt.py
    │   │   ├── finetune_gpt_distributed-1.3b.sh
    │   │   ├── generate-1.3b.sh
    │   │   ├── generate_samples_gpt.py
    │   │   ├── perspective_api.py
    │   │   └── self_generation
    │   │   │   └── selfgenerate-1.3b-unconditional.sh
    │   ├── msdp
    │   │   ├── README.md
    │   │   ├── data_processing.sh
    │   │   ├── eval_knwl_generation.sh
    │   │   ├── eval_resp_generation.sh
    │   │   ├── prep_resp_gen.sh
    │   │   ├── prompt_knwl_gen.sh
    │   │   └── prompt_resp_gen.sh
    │   └── sc21
    │   │   ├── CONFIG.sh
    │   │   ├── README.md
    │   │   ├── SBATCH.sh
    │   │   ├── SRUN.sh
    │   │   ├── run_figure_11.sh
    │   │   ├── run_figure_12.sh
    │   │   ├── run_figure_13.sh
    │   │   ├── run_figure_14.sh
    │   │   ├── run_figure_15.sh
    │   │   ├── run_figure_16.sh
    │   │   ├── run_figure_17.sh
    │   │   ├── run_figure_18.sh
    │   │   └── run_table_1.sh
    ├── bert
    │   ├── README.md
    │   └── train_bert_340m_distributed.sh
    ├── export
    │   ├── README.md
    │   ├── knowledge_distillation
    │   │   └── pretrain_gpt_modelopt.py
    │   ├── ptq_and_trtllm_export
    │   │   ├── README.md
    │   │   ├── ptq_trtllm_llama2_7b.sh
    │   │   ├── ptq_trtllm_llama3_1_8b.sh
    │   │   ├── ptq_trtllm_llama3_8b.sh
    │   │   ├── ptq_trtllm_minitron_8b.sh
    │   │   ├── ptq_trtllm_mistral_12b.sh
    │   │   ├── ptq_trtllm_mixtral_8x7b.sh
    │   │   ├── text_generation_ptq.py
    │   │   └── trtllm_text_generation.py
    │   └── trtllm_export
    │   │   ├── README.md
    │   │   ├── distributed_export
    │   │       └── gpt_distributed_gpu_export.py
    │   │   └── single_device_export
    │   │       └── gpt_single_device_cpu_export.py
    ├── gpt3
    │   ├── README.md
    │   ├── gpt_config.yaml
    │   └── train_gpt3_175b_distributed.sh
    ├── inference
    │   ├── README.md
    │   ├── gpt
    │   │   └── simple_gpt_batch_inference.py
    │   ├── llama_mistral
    │   │   ├── huggingface_reference.py
    │   │   ├── run_text_generation_llama3.1.sh
    │   │   ├── run_text_generation_llama3.sh
    │   │   └── run_text_generation_mistral.sh
    │   ├── run_text_generation_server_345M.sh
    │   ├── run_text_generation_server_345M_8_tensor_parallel.sh
    │   └── t5
    │   │   └── simple_t5_batch_inference.py
    ├── mamba
    │   ├── .gitignore
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── run_text_gen_server_8b.sh
    │   ├── run_text_gen_server_8b_gpt3.sh
    │   └── train.sh
    ├── mixtral
    │   ├── README.md
    │   └── train_mixtral_8x7b_distributed.sh
    ├── multimodal
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── assets
    │   │   └── pretrain_curves.png
    │   ├── combine_lm_vision_checkpoints.sh
    │   ├── combine_state_dicts.py
    │   ├── config.py
    │   ├── convert_llava_pretrain_to_wds.py
    │   ├── dataloader_provider.py
    │   ├── dataset_helpers.py
    │   ├── evaluate_ai2d.py
    │   ├── evaluate_chartqa.py
    │   ├── evaluate_coco.py
    │   ├── evaluate_mathvista.py
    │   ├── evaluate_mmmu.py
    │   ├── evaluate_ocrbench.py
    │   ├── evaluate_textvqa.py
    │   ├── evaluate_vqav2.py
    │   ├── evaluation_datasets.py
    │   ├── image_processing.py
    │   ├── layer_specs.py
    │   ├── manual_prompts.json
    │   ├── model.py
    │   ├── model_converter
    │   │   ├── clip_converter.py
    │   │   ├── internvit_converter.py
    │   │   ├── siglip_converter.py
    │   │   └── vision_model_tester.py
    │   ├── multimodal_args.py
    │   ├── nvlm
    │   │   ├── README.md
    │   │   ├── internvit.py
    │   │   ├── nvlm_prompts.json
    │   │   ├── pp_checkpoint_converter.py
    │   │   ├── pretrain_blend.yaml
    │   │   ├── pretrain_qwen20_72b_internvit_6b.sh
    │   │   ├── pretrain_yi_34b_internvit_6b.sh
    │   │   ├── run_text_generation_qwen20_72b_internvit_6b.sh
    │   │   ├── run_text_generation_yi_34b_internvit_6b.sh
    │   │   ├── sft_34b_internvit.sh
    │   │   ├── sft_blend.yaml
    │   │   └── sft_qwen20_72b_internvit_6b.sh
    │   ├── pretrain_dataset.yaml
    │   ├── pretrain_mistral_clip.sh
    │   ├── run_text_generation.py
    │   ├── sft_dataset.yaml
    │   ├── sft_mistral_clip.sh
    │   ├── text_generation_mistral_clip.sh
    │   └── train.py
    ├── retro
    │   ├── README.md
    │   ├── preprocess_data.sh
    │   └── train_retro_2b_distributed.sh
    ├── run_simple_mcore_train_loop.py
    └── t5
    │   ├── README.md
    │   ├── t5_mcore_train_curve.png
    │   └── train_t5_220m_distributed.sh
├── images
    ├── model_table.png
    ├── remoe_comparison.png
    ├── remoe_scaling.png
    ├── strong_scaling.png
    └── weak_scaling.png
├── megatron
    ├── core
    │   ├── QuickStart.md
    │   ├── README.md
    │   ├── README_STRAGGLER.md
    │   ├── __init__.py
    │   ├── config_logger.py
    │   ├── datasets
    │   │   ├── Makefile
    │   │   ├── __init__.py
    │   │   ├── bert_dataset.py
    │   │   ├── blended_dataset.py
    │   │   ├── blended_megatron_dataset_builder.py
    │   │   ├── blended_megatron_dataset_config.py
    │   │   ├── gpt_dataset.py
    │   │   ├── helpers.cpp
    │   │   ├── helpers.py
    │   │   ├── indexed_dataset.py
    │   │   ├── masked_dataset.py
    │   │   ├── megatron_dataset.py
    │   │   ├── megatron_tokenizer.py
    │   │   ├── multimodal_dataset.py
    │   │   ├── readme.md
    │   │   ├── retro
    │   │   │   ├── __init__.py
    │   │   │   ├── config
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bert_embedders.py
    │   │   │   │   ├── config.py
    │   │   │   │   ├── gpt_chunk_datasets.py
    │   │   │   │   └── tokenizers.py
    │   │   │   ├── db
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── build.py
    │   │   │   │   ├── dataset.py
    │   │   │   │   └── utils.py
    │   │   │   ├── external_libs.py
    │   │   │   ├── index
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── build.py
    │   │   │   │   ├── factory.py
    │   │   │   │   ├── index.py
    │   │   │   │   ├── indexes
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── faiss_base.py
    │   │   │   │   │   └── faiss_par_add.py
    │   │   │   │   ├── utils.py
    │   │   │   │   └── validate.py
    │   │   │   ├── query
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── gpt_chunk_dataset.py
    │   │   │   │   ├── multi_split_gpt_dataset.py
    │   │   │   │   ├── query.py
    │   │   │   │   ├── retro_dataset.py
    │   │   │   │   └── utils.py
    │   │   │   └── utils.py
    │   │   ├── t5_dataset.py
    │   │   ├── utils.py
    │   │   └── utils_s3.py
    │   ├── dist_checkpointing
    │   │   ├── __init__.py
    │   │   ├── core.py
    │   │   ├── dict_utils.py
    │   │   ├── exchange_utils.py
    │   │   ├── mapping.py
    │   │   ├── optimizer.py
    │   │   ├── serialization.py
    │   │   ├── state_dict_transformation.py
    │   │   ├── strategies
    │   │   │   ├── __init__.py
    │   │   │   ├── async_utils.py
    │   │   │   ├── base.py
    │   │   │   ├── common.py
    │   │   │   ├── filesystem_async.py
    │   │   │   ├── fully_parallel.py
    │   │   │   ├── resharding.py
    │   │   │   ├── state_dict_saver.py
    │   │   │   ├── tensorstore.py
    │   │   │   ├── torch.py
    │   │   │   ├── two_stage.py
    │   │   │   └── zarr.py
    │   │   ├── utils.py
    │   │   └── validation.py
    │   ├── distributed
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── data_parallel_base.py
    │   │   ├── distributed_data_parallel.py
    │   │   ├── distributed_data_parallel_config.py
    │   │   ├── finalize_model_grads.py
    │   │   ├── param_and_grad_buffer.py
    │   │   └── torch_fully_sharded_data_parallel.py
    │   ├── enums.py
    │   ├── export
    │   │   ├── __init__.py
    │   │   ├── data_type.py
    │   │   ├── export_config.py
    │   │   ├── model_type.py
    │   │   └── trtllm
    │   │   │   ├── __init__.py
    │   │   │   ├── engine_builder
    │   │   │       ├── __init__.py
    │   │   │       └── trtllm_engine_builder.py
    │   │   │   ├── model_to_trllm_mapping
    │   │   │       ├── __init__.py
    │   │   │       └── default_conversion_dict.py
    │   │   │   ├── trt_model_config.py
    │   │   │   ├── trt_model_type.py
    │   │   │   ├── trtllm_helper.py
    │   │   │   ├── trtllm_layers.py
    │   │   │   └── trtllm_weights_converter
    │   │   │       ├── __init__.py
    │   │   │       ├── distributed_trtllm_model_weights_converter.py
    │   │   │       └── single_device_trtllm_model_weights_converter.py
    │   ├── extensions
    │   │   ├── __init__.py
    │   │   └── transformer_engine.py
    │   ├── fusions
    │   │   ├── __init__.py
    │   │   ├── fused_bias_dropout.py
    │   │   ├── fused_bias_geglu.py
    │   │   ├── fused_bias_gelu.py
    │   │   ├── fused_bias_swiglu.py
    │   │   ├── fused_cross_entropy.py
    │   │   ├── fused_layer_norm.py
    │   │   └── fused_softmax.py
    │   ├── inference
    │   │   ├── __init__.py
    │   │   ├── ammo_support
    │   │   │   ├── __init__.py
    │   │   │   └── gpt
    │   │   │   │   ├── model_specs.py
    │   │   │   │   └── state_dict_hooks.py
    │   │   ├── common_inference_params.py
    │   │   ├── communication_utils.py
    │   │   ├── engines
    │   │   │   ├── __init__.py
    │   │   │   ├── abstract_engine.py
    │   │   │   └── mcore_engine.py
    │   │   ├── inference_request.py
    │   │   ├── model_inference_wrappers
    │   │   │   ├── __init__.py
    │   │   │   ├── abstract_model_inference_wrapper.py
    │   │   │   ├── gpt
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── gpt_inference_wrapper.py
    │   │   │   ├── inference_wrapper_config.py
    │   │   │   └── t5
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── t5_inference_wrapper.py
    │   │   ├── modelopt_support
    │   │   │   ├── __init__.py
    │   │   │   └── gpt
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── model_specs.py
    │   │   │   │   └── state_dict_hooks.py
    │   │   ├── scheduler.py
    │   │   ├── text_generation_controllers
    │   │   │   ├── __init__.py
    │   │   │   ├── encoder_decoder_text_generation_controller.py
    │   │   │   └── simple_text_generation_controller.py
    │   │   └── utils.py
    │   ├── inference_params.py
    │   ├── jit.py
    │   ├── model_parallel_config.py
    │   ├── models
    │   │   ├── T5
    │   │   │   ├── __init__.py
    │   │   │   ├── t5_model.py
    │   │   │   └── t5_spec.py
    │   │   ├── __init__.py
    │   │   ├── bert
    │   │   │   ├── __init__.py
    │   │   │   ├── bert_layer_specs.py
    │   │   │   ├── bert_lm_head.py
    │   │   │   ├── bert_model.py
    │   │   │   └── pooler.py
    │   │   ├── common
    │   │   │   ├── __init__.py
    │   │   │   ├── embeddings
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── language_model_embedding.py
    │   │   │   │   ├── rope_utils.py
    │   │   │   │   ├── rotary_pos_embedding.py
    │   │   │   │   └── yarn_rotary_pos_embedding.py
    │   │   │   ├── language_module
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── language_module.py
    │   │   │   └── vision_module
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── vision_module.py
    │   │   ├── gpt
    │   │   │   ├── __init__.py
    │   │   │   ├── gpt_layer_specs.py
    │   │   │   ├── gpt_model.py
    │   │   │   └── moe_module_specs.py
    │   │   ├── mamba
    │   │   │   ├── __init__.py
    │   │   │   ├── mamba_layer_specs.py
    │   │   │   └── mamba_model.py
    │   │   ├── multimodal
    │   │   │   ├── __init__.py
    │   │   │   ├── llava_model.py
    │   │   │   └── llava_spec.py
    │   │   ├── retro
    │   │   │   ├── __init__.py
    │   │   │   ├── base_attention.py
    │   │   │   ├── config.py
    │   │   │   ├── decoder_attention.py
    │   │   │   ├── decoder_spec.py
    │   │   │   ├── encoder_attention.py
    │   │   │   ├── encoder_spec.py
    │   │   │   ├── model.py
    │   │   │   └── utils.py
    │   │   └── vision
    │   │   │   ├── __init__.py
    │   │   │   ├── clip_vit_model.py
    │   │   │   ├── multimodal_projector.py
    │   │   │   └── vit_layer_specs.py
    │   ├── num_microbatches_calculator.py
    │   ├── optimizer
    │   │   ├── __init__.py
    │   │   ├── clip_grads.py
    │   │   ├── distrib_optimizer.py
    │   │   ├── grad_scaler.py
    │   │   ├── optimizer.py
    │   │   └── optimizer_config.py
    │   ├── optimizer_param_scheduler.py
    │   ├── package_info.py
    │   ├── packed_seq_params.py
    │   ├── parallel_state.py
    │   ├── pipeline_parallel
    │   │   ├── __init__.py
    │   │   ├── p2p_communication.py
    │   │   └── schedules.py
    │   ├── requirements.txt
    │   ├── rerun_state_machine.py
    │   ├── ssm
    │   │   ├── __init__.py
    │   │   ├── mamba_block.py
    │   │   ├── mamba_hybrid_layer_allocation.py
    │   │   ├── mamba_layer.py
    │   │   ├── mamba_mixer.py
    │   │   └── triton_cache_manager.py
    │   ├── tensor_parallel
    │   │   ├── __init__.py
    │   │   ├── cross_entropy.py
    │   │   ├── data.py
    │   │   ├── layers.py
    │   │   ├── mappings.py
    │   │   ├── random.py
    │   │   └── utils.py
    │   ├── timers.py
    │   ├── transformer
    │   │   ├── __init__.py
    │   │   ├── attention.py
    │   │   ├── cuda_graphs.py
    │   │   ├── custom_layers
    │   │   │   ├── __init__.py
    │   │   │   └── transformer_engine.py
    │   │   ├── dot_product_attention.py
    │   │   ├── enums.py
    │   │   ├── identity_op.py
    │   │   ├── mlp.py
    │   │   ├── module.py
    │   │   ├── moe
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── experts.py
    │   │   │   ├── grouped_gemm_util.py
    │   │   │   ├── legacy_a2a_token_dispatcher.py
    │   │   │   ├── moe_layer.py
    │   │   │   ├── moe_utils.py
    │   │   │   ├── router.py
    │   │   │   ├── shared_experts.py
    │   │   │   ├── token_dispatcher.py
    │   │   │   └── upcycling_utils.py
    │   │   ├── multi_latent_attention.py
    │   │   ├── spec_utils.py
    │   │   ├── torch_layer_norm.py
    │   │   ├── torch_norm.py
    │   │   ├── transformer_block.py
    │   │   ├── transformer_config.py
    │   │   ├── transformer_layer.py
    │   │   └── utils.py
    │   └── utils.py
    ├── inference
    │   ├── __init__.py
    │   ├── algos
    │   │   ├── __init__.py
    │   │   └── distillation.py
    │   ├── arguments.py
    │   ├── checkpointing.py
    │   ├── docs
    │   │   └── distillation.md
    │   ├── endpoints
    │   │   ├── common.py
    │   │   └── completions.py
    │   ├── gpt
    │   │   ├── __init__.py
    │   │   ├── loss_func.py
    │   │   └── model_provider.py
    │   ├── static
    │   │   └── index.html
    │   ├── text_generation
    │   │   ├── __init__.py
    │   │   ├── api.py
    │   │   ├── beam_utils.py
    │   │   ├── communication.py
    │   │   ├── forward_step.py
    │   │   ├── generation.py
    │   │   ├── sampling.py
    │   │   └── tokenization.py
    │   └── text_generation_server.py
    ├── legacy
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── autoaugment.py
    │   │   ├── biencoder_dataset_utils.py
    │   │   ├── data_samplers.py
    │   │   ├── dataset_utils.py
    │   │   ├── ict_dataset.py
    │   │   ├── image_folder.py
    │   │   ├── multimodal_dataset.py
    │   │   ├── orqa_wiki_dataset.py
    │   │   ├── realm_dataset_utils.py
    │   │   ├── realm_index.py
    │   │   └── vit_dataset.py
    │   ├── fp16_deprecated
    │   │   └── loss_scaler.py
    │   ├── fused_kernels
    │   │   ├── __init__.py
    │   │   ├── compat.h
    │   │   ├── tests
    │   │   │   ├── __init__.py
    │   │   │   └── test_fused_kernels.py
    │   │   └── type_shim.h
    │   ├── indexer.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── bert_model.py
    │   │   ├── biencoder_model.py
    │   │   ├── classification.py
    │   │   ├── enums.py
    │   │   ├── fused_bias_gelu.py
    │   │   ├── fused_layer_norm.py
    │   │   ├── fused_softmax.py
    │   │   ├── gpt_model.py
    │   │   ├── language_model.py
    │   │   ├── module.py
    │   │   ├── multiple_choice.py
    │   │   ├── realm_model.py
    │   │   ├── rms_norm.py
    │   │   ├── t5_model.py
    │   │   ├── transformer.py
    │   │   ├── utils.py
    │   │   └── vision
    │   │   │   ├── classification.py
    │   │   │   ├── dino.py
    │   │   │   ├── esvit_swin_backbone.py
    │   │   │   ├── inpainting.py
    │   │   │   ├── knn_monitor.py
    │   │   │   ├── mit_backbone.py
    │   │   │   ├── swin_backbone.py
    │   │   │   ├── utils.py
    │   │   │   └── vit_backbone.py
    │   └── mpu
    │   │   └── tests
    │   │       ├── __init__.py
    │   │       ├── commons.py
    │   │       ├── test_cross_entropy.py
    │   │       ├── test_data.py
    │   │       ├── test_initialize.py
    │   │       ├── test_layers.py
    │   │       └── test_random.py
    └── training
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── arguments.py
    │   ├── async_utils.py
    │   ├── checkpointing.py
    │   ├── dist_signal_handler.py
    │   ├── ft_integration.py
    │   ├── global_vars.py
    │   ├── initialize.py
    │   ├── log_handler.py
    │   ├── one_logger_utils.py
    │   ├── theoretical_memory_usage.py
    │   ├── tokenizer
    │       ├── __init__.py
    │       ├── bert_tokenization.py
    │       ├── gpt2_tokenization.py
    │       ├── multimodal_tokenizer.py
    │       └── tokenizer.py
    │   ├── training.py
    │   ├── utils.py
    │   └── yaml_arguments.py
├── mypy.ini
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_ict.py
├── pretrain_mamba.py
├── pretrain_retro.py
├── pretrain_t5.py
├── pretrain_vision_classify.py
├── pretrain_vision_dino.py
├── pretrain_vision_inpaint.py
├── pretrain_vlm.py
├── pyproject.toml
├── pytest.ini
├── requirements
    ├── pytorch:24.01
    │   └── requirements.txt
    └── pytorch:24.07
    │   └── requirements.txt
├── scripts
    ├── train_llama_182m_dense.sh
    ├── train_llama_182m_moe.sh
    ├── train_llama_182m_remoe.sh
    ├── train_llama_469m_dense.sh
    ├── train_llama_469m_moe.sh
    ├── train_llama_469m_remoe.sh
    ├── train_llama_978m_dense.sh
    ├── train_llama_978m_moe.sh
    └── train_llama_978m_remoe.sh
├── setup.py
├── tasks
    ├── data_utils.py
    ├── ensemble_classifier.py
    ├── eval_utils.py
    ├── finetune_utils.py
    ├── glue
    │   ├── data.py
    │   ├── finetune.py
    │   ├── mnli.py
    │   └── qqp.py
    ├── main.py
    ├── msdp
    │   ├── README.md
    │   ├── evaluate.py
    │   ├── main.py
    │   ├── metrics.py
    │   ├── preprocessing.py
    │   └── prompt.py
    ├── orqa
    │   ├── README.md
    │   ├── evaluate_orqa.py
    │   ├── evaluate_utils.py
    │   ├── supervised
    │   │   ├── data.py
    │   │   ├── eval_utils.py
    │   │   └── finetune.py
    │   └── unsupervised
    │   │   ├── nq.py
    │   │   ├── qa_utils.py
    │   │   └── tokenizers.py
    ├── quantize
    │   └── calibrate_gpt.py
    ├── race
    │   ├── data.py
    │   └── finetune.py
    ├── vision
    │   ├── classification
    │   │   ├── classification.py
    │   │   └── eval_utils.py
    │   ├── finetune_utils.py
    │   ├── main.py
    │   └── segmentation
    │   │   ├── cityscapes.py
    │   │   ├── data.py
    │   │   ├── finetune_segformer.py
    │   │   ├── finetune_setr.py
    │   │   ├── metrics.py
    │   │   ├── seg_heads.py
    │   │   ├── seg_models.py
    │   │   ├── transforms.py
    │   │   └── utils.py
    └── zeroshot_gpt
    │   ├── datasets.py
    │   ├── detokenizer.py
    │   └── evaluate.py
├── tests
    ├── __init__.py
    ├── functional_tests
    │   ├── __init__.py
    │   ├── python_test_utils
    │   │   ├── __init__.py
    │   │   ├── common.py
    │   │   ├── get_test_results_from_tensorboard_logs.py
    │   │   ├── test_ci_pipeline.py
    │   │   ├── test_fp8_ci_pipeline.py
    │   │   └── test_resume_checkpoint_pipeline.py
    │   ├── shell_test_utils
    │   │   ├── _run_training.sh
    │   │   └── run_ci_test.sh
    │   └── test_cases
    │   │   ├── bert
    │   │       ├── bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_tp1_pp4_vp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_tp2_pp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_nightly_dgx_a100_1N8G_tp1_pp2
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── bert_nightly_dgx_a100_1N8G_tp4_pp1
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       └── bert_release
    │   │       │   ├── golden_values_0.9.0.json
    │   │       │   └── model_config.yaml
    │   │   ├── common
    │   │       └── ckpt_converter
    │   │       │   ├── __main__.py
    │   │       │   └── model_config.yaml
    │   │   ├── gpt-nemo
    │   │       ├── gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       └── gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │   ├── gpt
    │   │       ├── gpt3_15b_8t_release
    │   │       │   ├── golden_values_0.8.0.json
    │   │       │   ├── golden_values_0.9.0.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_15b_8t_release_sm
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_te_tp2_pp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_tp2_pp2_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
    │   │       │   └── model_config.yaml
    │   │       └── gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
    │   │       │   └── model_config.yaml
    │   │   ├── mixtral
    │   │       ├── mixtral_8x22b_tp2pp8ep8vpp1_release
    │   │       │   ├── golden_values_0.9.0.json
    │   │       │   └── model_config.yaml
    │   │       ├── mixtral_8x7b_alltoall_tp2pp4ep4_release
    │   │       │   ├── golden_values_0.8.0.json
    │   │       │   ├── golden_values_0.9.0.json
    │   │       │   └── model_config.yaml
    │   │       ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm
    │   │       │   └── model_config.yaml
    │   │       └── mixtral_8x7b_tp1pp4ep8vpp8_release
    │   │       │   ├── golden_values_0.9.0.json
    │   │       │   └── model_config.yaml
    │   │   ├── multimodal-llava
    │   │       ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       ├── multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
    │   │       │   ├── golden_values_dev.json
    │   │       │   ├── golden_values_lts.json
    │   │       │   └── model_config.yaml
    │   │       └── multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
    │   │       │   └── model_config.yaml
    │   │   └── t5
    │   │       ├── t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1
    │   │           ├── golden_values_dev.json
    │   │           ├── golden_values_lts.json
    │   │           └── model_config.yaml
    │   │       ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
    │   │           └── golden_values_lts.json
    │   │       ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
    │   │           └── golden_values_lts.json
    │   │       └── t5_release
    │   │           ├── golden_values_0.9.0.json
    │   │           └── model_config.yaml
    ├── test_utils
    │   ├── python_scripts
    │   │   ├── common.py
    │   │   ├── generate_jet_trigger_job.py
    │   │   ├── generate_local_jobs.py
    │   │   └── launch_jet_workload.py
    │   ├── recipes
    │   │   ├── _build-mcore-dev.yaml
    │   │   ├── _build-mcore-lts.yaml
    │   │   ├── _build-nemo.yaml
    │   │   ├── bert.yaml
    │   │   ├── gpt-modelopt.yaml
    │   │   ├── gpt-nemo.yaml
    │   │   ├── gpt.yaml
    │   │   ├── multimodal-llava.yaml
    │   │   ├── t5.yaml
    │   │   └── unit-tests.yaml
    │   └── shell_scripts
    │   │   └── notify.sh
    └── unit_tests
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── data
    │       ├── __init__.py
    │       ├── test_bin_reader.py
    │       ├── test_builder.py
    │       ├── test_gpt_dataset.py
    │       ├── test_multimodal_dataset.py
    │       ├── test_preprocess_data.py
    │       └── test_preprocess_mmdata.py
    │   ├── dist_checkpointing
    │       ├── __init__.py
    │       ├── conftest.py
    │       ├── models
    │       │   ├── __init__.py
    │       │   ├── common.py
    │       │   ├── test_bert_model.py
    │       │   ├── test_gpt_model.py
    │       │   ├── test_mamba.py
    │       │   ├── test_mlp_glu.py
    │       │   ├── test_moe_experts.py
    │       │   ├── test_retro_model.py
    │       │   └── test_t5_model.py
    │       ├── test_async_save.py
    │       ├── test_cached_metadata.py
    │       ├── test_flattened_resharding.py
    │       ├── test_fp8.py
    │       ├── test_fully_parallel.py
    │       ├── test_local.py
    │       ├── test_mapping.py
    │       ├── test_nonpersistent.py
    │       ├── test_optimizer.py
    │       ├── test_serialization.py
    │       └── utils.py
    │   ├── distributed
    │       ├── test_grad_reduce_for_replicated_embedder.py
    │       └── test_param_and_grad_buffer.py
    │   ├── export
    │       └── trtllm
    │       │   ├── __init__.py
    │       │   ├── test_distributed_fp8.py
    │       │   ├── test_single_device_fp8.py
    │       │   ├── test_trtllm_distributed_gpu_converter.py
    │       │   ├── test_trtllm_helper.py
    │       │   ├── test_trtllm_layers.py
    │       │   └── test_trtllm_single_device_converter.py
    │   ├── fusions
    │       └── test_torch_softmax.py
    │   ├── inference
    │       ├── __init__.py
    │       ├── engines
    │       │   ├── __init__.py
    │       │   └── test_mcore_engine.py
    │       ├── model_inference_wrappers
    │       │   ├── __init__.py
    │       │   ├── gpt
    │       │   │   └── test_gpt_inference_wrapper.py
    │       │   ├── t5
    │       │   │   └── test_t5_inference_wrapper.py
    │       │   └── test_model_inference_wrapper_config.py
    │       ├── test_common_inference_params.py
    │       ├── test_flash_decode.py
    │       ├── test_inference_utils.py
    │       ├── test_modelopt_gpt_model.py
    │       ├── test_scheduler.py
    │       └── text_generation_controllers
    │       │   ├── __init__.py
    │       │   ├── test_encoder_decoder_text_generation_controller.py
    │       │   └── test_simple_text_generation_controller.py
    │   ├── models
    │       ├── __init__.py
    │       ├── test_base_embedding.py
    │       ├── test_bert_model.py
    │       ├── test_clip_vit_model.py
    │       ├── test_gpt_model.py
    │       ├── test_llava_model.py
    │       ├── test_mamba_model.py
    │       ├── test_multimodal_projector.py
    │       └── test_t5_model.py
    │   ├── pipeline_parallel
    │       ├── __init__.py
    │       ├── test_helpers.py
    │       └── test_schedules.py
    │   ├── ssm
    │       ├── test_mamba_block.py
    │       ├── test_mamba_hybrid_layer_allocation.py
    │       ├── test_mamba_layer.py
    │       └── test_mamba_mixer.py
    │   ├── tensor_parallel
    │       ├── __init__.py
    │       ├── test_cross_entropy.py
    │       ├── test_data.py
    │       ├── test_initialization.py
    │       ├── test_layers.py
    │       ├── test_mappings.py
    │       ├── test_random.py
    │       └── test_tensor_parallel_utils.py
    │   ├── test_basic.py
    │   ├── test_imports.py
    │   ├── test_inference.py
    │   ├── test_local_multi_tensor_fns.py
    │   ├── test_num_microbatches_calculator.py
    │   ├── test_optimizer.py
    │   ├── test_optimizer_param_scheduler.py
    │   ├── test_parallel_state.py
    │   ├── test_tokenizer.py
    │   ├── test_training.py
    │   ├── test_utilities.py
    │   ├── test_utils.py
    │   └── transformer
    │       ├── __init__.py
    │       ├── moe
    │           ├── __init__.py
    │           ├── conftest.py
    │           ├── test_a2a_token_dispatcher.py
    │           ├── test_aux_loss.py
    │           ├── test_grouped_mlp.py
    │           ├── test_moe_layer.py
    │           ├── test_routers.py
    │           ├── test_sequential_mlp.py
    │           ├── test_shared_experts.py
    │           ├── test_token_dispatcher.py
    │           └── test_upcycling.py
    │       ├── test_attention.py
    │       ├── test_attention_packed_seq.py
    │       ├── test_core_attention.py
    │       ├── test_mlp.py
    │       ├── test_module.py
    │       ├── test_multi_latent_attention.py
    │       ├── test_retro_attention.py
    │       ├── test_rope.py
    │       ├── test_spec_customization.py
    │       ├── test_transformer_block.py
    │       └── test_transformer_layer.py
├── tools
    ├── autoformat.sh
    ├── bert_embedding
    │   ├── __init__.py
    │   ├── dataset.py
    │   ├── embed.py
    │   ├── external_libs.py
    │   └── huggingface.py
    ├── checkpoint
    │   ├── convert.py
    │   ├── hybrid_conversion.py
    │   ├── loader_llama_mistral.py
    │   ├── loader_mcore.py
    │   ├── loader_megatron.py
    │   ├── loader_mixtral_hf.py
    │   ├── saver_mcore.py
    │   ├── saver_megatron.py
    │   ├── schema_base.py
    │   ├── schema_mcore.py
    │   └── utils.py
    ├── copyright.sh
    ├── linter.py
    ├── merge_datasets.py
    ├── openwebtext
    │   ├── README.md
    │   ├── add_id.py
    │   ├── blacklist_urls.py
    │   ├── cleanup_dataset.py
    │   ├── cleanup_fix_dataset.py
    │   ├── filter_ngrams.py
    │   ├── find_duplicates.py
    │   ├── group_duplicate_url.py
    │   ├── merge_jsons.py
    │   └── remove_group_duplicates.py
    ├── preprocess_data.py
    ├── preprocess_data_nmt.py
    ├── preprocess_mmdata.py
    ├── report_theoretical_memory.py
    ├── retro
    │   ├── README.md
    │   ├── build_db.md
    │   ├── cli
    │   │   ├── __init__.py
    │   │   ├── __main__.py
    │   │   └── cli.py
    │   ├── config_utils.py
    │   ├── docker
    │   │   └── Dockerfile
    │   ├── preprocess_data.py
    │   ├── sft
    │   │   ├── README.md
    │   │   ├── dataset_conv.py
    │   │   ├── open_inst.sh
    │   │   ├── sft_retro.py
    │   │   └── sft_retro_lm.sh
    │   └── text_generation
    │   │   ├── evaluate.py
    │   │   ├── metrics.py
    │   │   ├── retro_api.py
    │   │   ├── retro_generate.sh
    │   │   ├── retro_generation.py
    │   │   └── retro_text_generation.py
    ├── run_mamba_text_generation_server.py
    ├── run_text_generation_server.py
    ├── run_vlm_text_generation.py
    └── text_generation_cli.py
└── unit-test-job-lts.yaml


/.coveragerc:
--------------------------------------------------------------------------------
1 | [html]
2 | directory = coverage
3 | 
4 | [run]
5 | data_file = .coverage_$LOCAL_RANK
6 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | extend-ignore = E203,E501,F401,E402,E714
4 | per-file-ignores = __init__.py:F401


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: BUG
 3 | about: Report a bug that needs attention
 4 | title: "[BUG]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.
15 | 
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 | 
19 | **Stack trace/logs**
20 | If applicable, add the stack trace or logs from the time of the error.
21 | 
22 | **Environment (please complete the following information):**
23 |  - Megatron-LM commit ID
24 |  - PyTorch version
25 |  - CUDA version
26 |  - NCCL version
27 | 
28 | **Proposed fix**
29 | If you have a proposal for how to fix the issue, state it here or link to a PR.
30 | 
31 | **Additional context**
32 | Add any other context about the problem here.
33 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancement.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: ENHANCEMENT
 3 | about: Suggest an idea to improve this project
 4 | title: "[ENHANCEMENT]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Proposed implementation**
20 | If you have a proposed implementation for the feature, state it here or link to a PR.
21 | 
22 | **Additional context**
23 | Add any other context or screenshots about the feature request here.
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: QUESTION
 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement
 4 |   request
 5 | title: "[QUESTION]"
 6 | labels: ''
 7 | assignees: ''
 8 | 
 9 | ---
10 | 
11 | **Your question**
12 | Ask a clear and concise question about Megatron-LM.
13 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/regression.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: REGRESSION
 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update
 4 | title: "[REGRESSION]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the regression**
11 | A clear and concise description of what the regression is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.
15 | 
16 | **Previous performance**
17 | What speed or accuracy did you previously see?
18 | 
19 | **New performance**
20 | What speed or accuracy do you see after the update?
21 | 
22 | **Stack trace/logs**
23 | If applicable, add the stack trace or logs related to the regression.
24 | 
25 | **Environment (please complete the following information):**
26 |  - Previous Megatron-LM commit ID
27 |  - New Megatron-LM commit ID
28 |  - Previous PyTorch version
29 |  - New PyTorch version
30 |  - Previous CUDA version
31 |  - New CUDA version
32 |  - Previous NCCL version
33 |  - New NCCL version
34 | 
35 | **Proposed fix**
36 | If you have a proposal for how to fix the issue, state it here or link to a PR.
37 | 
38 | **Additional context**
39 | Add any other context about the problem here.
40 | 


--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
 2 | #
 3 | # You can adjust the behavior by modifying this file.
 4 | # For more information, see:
 5 | # https://github.com/actions/stale
 6 | name: Mark stale issues and pull requests
 7 | 
 8 | on:
 9 |   schedule:
10 |   - cron: '15 18 * * *'
11 | 
12 | jobs:
13 |   stale:
14 | 
15 |     runs-on: ubuntu-latest
16 |     permissions:
17 |       issues: write
18 |       pull-requests: write
19 | 
20 |     steps:
21 |     - uses: actions/stale@v5
22 |       with:
23 |         repo-token: ${{ secrets.GITHUB_TOKEN }}
24 |         days-before-stale: 60
25 |         stale-issue-message: 'Marking as stale. No activity in 60 days.'
26 |         stale-pr-message: 'Marking as stale. No activity in 60 days.'
27 |         stale-issue-label: 'stale'
28 |         stale-pr-label: 'stale'
29 |         remove-stale-when-updated: true
30 |         operations-per-run: 1000
31 |         days-before-close: -1
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | *.so
 3 | build
 4 | .coverage_*
 5 | *.egg-info
 6 | *~
 7 | slurm*
 8 | logs
 9 | .vscode
10 | local/
11 | .gitmodules


--------------------------------------------------------------------------------
/.gitlab/labeler-config.yml:
--------------------------------------------------------------------------------
 1 | CI:
 2 | - .gitlab-ci.yml
 3 | - Dockerfile.ci.lts
 4 | - Dockerfile.ci.dev
 5 | - .github/**
 6 | - .gitlab/**
 7 | 
 8 | Datasets:
 9 | - megatron/core/datasets/**
10 | 
11 | BERT:
12 | - megatron/core/models/bert/**
13 | 
14 | GPT:
15 | - megatron/core/models/gpt/**
16 | 
17 | RETRO:
18 | - megatron/core/models/retro/**
19 | 
20 | Dist-Ckpt:
21 | - megatron/core/dist_checkpointing
22 | 
23 | Dist-Opt:
24 | - megatron/core/optimizer/distrib_optimizer 
25 | 
26 | Inference:
27 | - megatron/core/inference
28 | 
29 | MoE:
30 | - megatron/core/transformer/moe
31 | 
32 | Tests:
33 | - tests/**


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | [MAIN]
 2 | ignore-paths=tests
 3 | max-line-length=100
 4 | 
 5 | [MESSAGES CONTROL]
 6 | disable=all
 7 | 
 8 | enable=C0115,C0116,W0611,C0301
 9 | # C0115: missing-class-docstring
10 | # C0116: missing-function-docstring
11 | # W0611: unused-import
12 | # C0301: line-too-long
13 | 


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | [Core-ADLR] @mcore-reviewers/core-adlr
 2 | megatron/core/ 
 3 | 
 4 | [Core-NeMo] @mcore-reviewers/core-nemo
 5 | megatron/core/ 
 6 | 
 7 | ^[Core-MLPerf] @mcore-reviewers/mlperf
 8 | megatron/core/
 9 | 
10 | [MoE-ADLR] @mcore-reviewers/moe-adlr
11 | megatron/core/transformer/moe/
12 | 
13 | [MoE-Moe] @mcore-reviewers/moe-moe
14 | megatron/core/transformer/moe/
15 | 
16 | [Datasets] @mcore-reviewers/datasets
17 | megatron/core/datasets/
18 | 
19 | [BERT] @mcore-reviewers/bert
20 | megatron/core/models/bert/
21 | 
22 | [GPT] @mcore-reviewers/gpt
23 | megatron/core/models/gpt/
24 | 
25 | [Retro] @mcore-reviewers/retro
26 | megatron/core/models/retro/
27 | 
28 | [Distributed Checkpointing] @mcore-reviewers/dist-checkpointing
29 | megatron/core/dist_checkpointing/
30 | 
31 | [Distributed Optimizer] @mcore-reviewers/dist-optimizer
32 | megatron/core/optimizer/distrib_optimizer/ 
33 | 
34 | [Inference] @mcore-reviewers/inference
35 | megatron/core/inference/
36 | 
37 | ^[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference
38 | megatron/core/inference/
39 | 
40 | # [Context Parallelism] @mcore-reviewers/context-parallelism
41 | # 
42 | 
43 | [CI] @mcore-reviewers/ci
44 | .gitlab/
45 | .github/
46 | .gitlab-ci.yml
47 | Dockerfile.ci.lts
48 | Dockerfile.ci.dev
49 | tests/
50 | 


--------------------------------------------------------------------------------
/Dockerfile.linting:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:experimental
 2 | 
 3 | ARG FROM_IMAGE_NAME
 4 | FROM $FROM_IMAGE_NAME as main
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \
 8 |       /etc/apt/apt.conf.d/docker-clean
 9 | 
10 | RUN apt-get update && \
11 |       apt-get install -y python3-venv && \
12 |       apt-get clean && \
13 |       python -m venv /opt/jet
14 | 
15 | RUN pip3 install --no-cache-dir \
16 |       black==24.4.2 \
17 |       isort==5.13.2 \
18 |       flake8==7.1.0 \
19 |       pylint==3.2.6 \
20 |       mypy
21 | 
22 | COPY . /opt/megatron-lm
23 | 
24 | WORKDIR /opt/megatron-lm
25 | 
26 | ##### For NVIDIANS only #####
27 | FROM main as jet
28 | ARG CACHEBUST=0
29 | RUN --mount=type=secret,id=JET_INDEX_URLS \
30 |       JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
31 |       pip install jet-client jet-api --upgrade $JET_INDEX_URLS
32 | ENV PATH="$PATH:/opt/jet/bin"
33 | ###


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include megatron/core/requirements.txt
2 | include megatron/core/README.md
3 | recursive-include requirements *
4 | 


--------------------------------------------------------------------------------
/data_preprocessing.sh:
--------------------------------------------------------------------------------
 1 | for i in $(seq -w 0 29); do
 2 |     python tools/preprocess_data.py \
 3 |         --input ../pile/${i}.jsonl \
 4 |         --output-prefix ../pile_gpt_test/${i} \
 5 |         --vocab-file ../gpt2-vocab.json \
 6 |         --tokenizer-type GPT2BPETokenizer \
 7 |         --merge-file ../gpt2-merges.txt \
 8 |         --append-eod \
 9 |         --workers 32
10 | done


--------------------------------------------------------------------------------
/docs/source/api-guide/index.rst:
--------------------------------------------------------------------------------
 1 | API Guide
 2 | =========
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 4
 6 | 
 7 |    models
 8 |    tensor_parallel
 9 |    context_parallel
10 |    pipeline_parallel
11 |    fusions
12 |    transformer
13 |    moe
14 |    dist_checkpointing
15 |    dist_optimizer
16 |    distributed
17 |    datasets
18 |    num_microbatches_calculator
19 |    optimizer_param_scheduler
20 |    encoder_decoder_parallelism


--------------------------------------------------------------------------------
/docs/source/api-guide/models.bert.rst:
--------------------------------------------------------------------------------
 1 | models.bert package
 2 | ===================
 3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks.
 4 | 
 5 | Submodules
 6 | ----------
 7 | 
 8 | models.bert.bert\_model module
 9 | ------------------------------
10 | 
11 | .. automodule:: core.models.bert.bert_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.bert
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/models.gpt.rst:
--------------------------------------------------------------------------------
 1 | models.gpt package
 2 | ==================
 3 | This is the implementation of the popular GPT model. It supports several features such as model parallelization (Tensor Parallel, Pipeline Parallel, Data Parallel), mixture of experts, FP8, the distributed optimizer, etc. We are constantly adding new features, so be on the lookout, or raise an issue if you want to have something added.
 4 | 
 5 | Submodules
 6 | ----------
 7 | 
 8 | models.gpt.gpt\_model module
 9 | ----------------------------
10 | 
11 | .. automodule:: core.models.gpt.gpt_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.gpt
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/models.rst:
--------------------------------------------------------------------------------
 1 | models package
 2 | ==============
 3 | This package contains most of the popular LLMs. Currently we have support for GPT, BERT, T5 and Retro. This is an ever-growing list, so keep an eye out.
 4 | 
 5 | Subpackages
 6 | -----------
 7 | 
 8 | .. toctree::
 9 |    :maxdepth: 4
10 | 
11 |    models.gpt
12 |    models.t5
13 |    models.bert
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/models.t5.rst:
--------------------------------------------------------------------------------
 1 | models.t5 package
 2 | =================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | models.t5.t5\_model module
 8 | --------------------------
 9 | 
10 | .. automodule:: core.models.T5.t5_model
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models.T5
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/moe.rst:
--------------------------------------------------------------------------------
1 | Mixture of Experts package
2 | ==========================
3 | 
4 | .. mdinclude :: ../../../megatron/core/transformer/moe/README.md
5 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/num_microbatches_calculator.rst:
--------------------------------------------------------------------------------
 1 | Microbatches Calculator
 2 | =======================
 3 | This API is used to calculate the number of microbatches required to fit a given model on a given batch size.
 4 | 
 5 | 
 6 | Module contents
 7 | ---------------
 8 | 
 9 | .. automodule:: core.num_microbatches_calculator
10 |    :members:
11 |    :undoc-members:
12 |    :show-inheritance:
13 | 


--------------------------------------------------------------------------------
/docs/source/api-guide/optimizer_param_scheduler.rst:
--------------------------------------------------------------------------------
 1 | Optimizer Parameters Scheduler
 2 | ==============================
 3 | This API is used to calculate the learning rate and weight decay for the optimizer.
 4 | 
 5 | 
 6 | Module contents
 7 | ---------------
 8 | 
 9 | .. automodule:: core.optimizer_param_scheduler
10 |    :members:
11 |    :undoc-members:
12 |    :show-inheritance:
13 | 


--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/context_parallel/CP_overview.png


--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/context_parallel/CP_results.png


--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/distrib_optimizer/data_flow.png


--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/sharding_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/distrib_optimizer/sharding_scheme.png


--------------------------------------------------------------------------------
/docs/source/images/moe/token_drop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/moe/token_drop.png


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. Megatron Core documentation master file, created by
 2 |    sphinx-quickstart on Tue Aug 15 13:44:10 2023.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Megatron Core User Guide
 7 | ===================================
 8 | 
 9 | **Megatron Core** is a Python library that has the core components required to build your language models. 
10 | A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo/tree/main>`_. It offers a *simple* and
11 | *intuitive* API.
12 | 
13 | .. toctree::
14 |    :maxdepth: 2
15 |    :caption: User Guide
16 | 
17 |    user-guide/index
18 | 
19 | .. toctree::
20 |    :maxdepth: 3
21 |    :caption: API Guide
22 |    
23 |    api-guide/index
24 | 


--------------------------------------------------------------------------------
/docs/source/user-guide/index.rst:
--------------------------------------------------------------------------------
1 | User Guide 
2 | ============
3 | 
4 | .. mdinclude:: ../../../megatron/core/QuickStart.md


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Tokenize an input file with the GPT-2 BPE tokenizer via tools/preprocess_data.py.
 3 | #
 4 | # Usage: preprocess.sh <input-file> <output-prefix>
 5 | #   $1  passed to --input
 6 | #   $2  passed to --output-prefix
 7 | # The vocab/merge files are resolved relative to the current working directory.
 8 | 
 9 | VOCAB_FILE=gpt2-vocab.json
10 | MERGE_FILE=gpt2-merges.txt
11 | 
12 | # Quote the arguments so paths containing spaces are passed through intact.
13 | python3 tools/preprocess_data.py \
14 |     --input "$1" \
15 |     --output-prefix "$2" \
16 |     --vocab-file "$VOCAB_FILE" \
17 |     --merge-file "$MERGE_FILE" \
18 |     --tokenizer-type GPT2BPETokenizer \
19 |     --append-eod  --workers 20 --chunk-size 25
20 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation
3 | 
 4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge and response generation. More details are available in the [`knowledgeable task directory`](../../../tasks/msdp).
5 | 
6 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prep_resp_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Preparing the input file for the response generation (second-stage prompting)
 4 | 
 5 | DIR=`pwd`
 6 | 
 7 | TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
 8 |         (e.g., /testseen_processed.txt)
 9 | KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
10 |         (e.g., /testseen_knowledge_generations.txt)
11 | PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
12 |         (e.g., /testseen_processed_with_generated_knowledge.txt)
13 | 
14 | python ${DIR}/tasks/msdp/preprocessing.py \
15 |         --func prepare_input \
16 |         --test_file ${TEST_FILE} \
17 |         --knwl_gen_file ${KNOWLEDGE_FILE} \
18 |         --processed_file ${PROCESSED_FILE}
19 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/SBATCH.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | sbatch -p ${SLURM_PARTITION} \
 5 |        -A ${SLURM_ACCOUNT} \
 6 |        --job-name=${JOB_NAME} \
 7 |        --nodes=${NNODES} \
 8 |        --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
 9 | 
10 | exit 0
11 | 
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/SRUN.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
 4 | 
 5 | 
 6 | THIS_DIR=`pwd`
 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
 8 | mkdir -p ${THIS_DIR}/logs
 9 | 
10 | 
11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
12 | 
13 | 
14 | srun -l \
15 |      --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
16 |      --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
17 |      --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
18 | 
19 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_11.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [1, 2, 4, 8].
 8 | PP=1
 9 | 
10 | # Batch size (global batch size) options = [8, 128].
11 | GBS=8
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel size options.
18 | NLS=$((3*PP))
19 | NNODES=${PP}
20 | 
21 | 
22 | # Other params.
23 | TP=8
24 | MBS=1
25 | HS=20480
26 | NAH=128
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | 
30 | 
31 | # Name of the job.
32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
33 | 
34 | 
35 | # Import the configs.
36 | . `pwd`/CONFIG.sh
37 | 
38 | 
39 | # Submit the job.
40 | . `pwd`/SBATCH.sh
41 | 
42 | 
43 | exit 0
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_12.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Interleaved schedule options = [YES, NO].
 8 | INTERLEAVED=YES
 9 | 
10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60].
11 | GBS=12
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set interleaved schedule options.
18 | if [ ${INTERLEAVED} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${INTERLEAVED} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_13.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32].
 8 | PP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 128].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel and tensor-parallel size options.
18 | TP=$((64/PP))
19 | 
20 | 
21 | # Other params.
22 | MBS=1
23 | NLS=32
24 | HS=20480
25 | NAH=128
26 | DDP=local
27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
28 | NNODES=8
29 | 
30 | 
31 | # Name of the job.
32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
33 | 
34 | 
35 | # Import the configs.
36 | . `pwd`/CONFIG.sh
37 | 
38 | 
39 | # Submit the job.
40 | . `pwd`/SBATCH.sh
41 | 
42 | 
43 | exit 0
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_14.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32].
 8 | PP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 512].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel and data-parallel size options.
18 | DP=$((64/PP))
19 | 
20 | 
21 | # Other params.
22 | TP=1
23 | MBS=1
24 | NLS=32
25 | HS=3840
26 | NAH=32
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | NNODES=8
30 | 
31 | 
32 | # Name of the job.
33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
34 | 
35 | 
36 | # Import the configs.
37 | . `pwd`/CONFIG.sh
38 | 
39 | 
40 | # Submit the job.
41 | . `pwd`/SBATCH.sh
42 | 
43 | 
44 | exit 0
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_15.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32].
 8 | TP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 128, 512].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set tensor-parallel and data-parallel size options.
18 | DP=$((64/TP))
19 | 
20 | 
21 | # Other params.
22 | PP=1
23 | MBS=1
24 | NLS=32
25 | HS=3840
26 | NAH=32
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | NNODES=8
30 | 
31 | 
32 | # Name of the job.
33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
34 | 
35 | 
36 | # Import the configs.
37 | . `pwd`/CONFIG.sh
38 | 
39 | 
40 | # Submit the job.
41 | . `pwd`/SBATCH.sh
42 | 
43 | 
44 | exit 0
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_16.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Microbatch size options = [1, 2, 4, 8].
 8 | MBS=1
 9 | 
10 | # Batch size (global batch size) options = [128, 512].
11 | GBS=128
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Other params.
18 | TP=8
19 | PP=8
20 | NLS=32
21 | HS=15360
22 | NAH=128
23 | DDP=local
24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
25 | NNODES=8
26 | 
27 | 
28 | # Name of the job.
29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
30 | 
31 | 
32 | # Import the configs.
33 | . `pwd`/CONFIG.sh
34 | 
35 | 
36 | # Submit the job.
37 | . `pwd`/SBATCH.sh
38 | 
39 | 
40 | exit 0
41 | 
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_17.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Activation recomputation options = [YES, NO].
 8 | ACTIVATION_RECOMPUTATION=YES
 9 | 
10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256].
11 | GBS=1
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set activation recomputation.
18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS=""
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=16
31 | MBS=1
32 | NLS=80
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=16
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/run_figure_18.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Scatter-gather communication optimization options = [YES, NO].
 8 | SCATTER_GATHER=YES
 9 | 
10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60].
11 | GBS=12
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set scatter-gather communication optimization options.
18 | if [ ${SCATTER_GATHER} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${SCATTER_GATHER} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/export/README.md:
--------------------------------------------------------------------------------
 1 | # Megatron Core Export
 2 | 
 3 | This module is used to export megatron core models to different inference frameworks. 
 4 | Currently we support TRTLLM export. In the future we will be adding support for vLLM, etc. 
 5 | 
 6 | ## PTQ AND EXPORT
 7 | Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post training quantization, followed by an export to TRTLLM format. 
 8 | 
 9 | ## TRTLLM EXPORT
10 | Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone.


--------------------------------------------------------------------------------
/examples/inference/llama_mistral/huggingface_reference.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 3 | 
 4 | # Set up argument parsing
 5 | parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.")
 6 | parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation")
 7 | parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint")
 8 | 
 9 | # Parse command-line arguments
10 | args = parser.parse_args()
11 | 
12 | model_path = args.model_path
13 | prompt = args.prompt
14 | 
15 | config = AutoConfig.from_pretrained(model_path)
16 | tokenizer = AutoTokenizer.from_pretrained(model_path, config=config)
17 | model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda()
18 | 
19 | inputs = tokenizer(prompt, return_tensors="pt")
20 | for key in inputs:
21 |     inputs[key] = inputs[key].cuda()
22 | # top_k, top_p and do_sample are set for greedy argmax based sampling
23 | 
24 | outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0)
25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))


--------------------------------------------------------------------------------
/examples/inference/run_text_generation_server_345M.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This example will start serving the 345M model.
 3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
 4 |                   --nnodes 1 \
 5 |                   --node_rank 0 \
 6 |                   --master_addr localhost \
 7 |                   --master_port 6000"
 8 | 
 9 | CHECKPOINT=<Path to checkpoint (e.g /345m)>
10 | VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
11 | MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
12 | 
13 | export CUDA_DEVICE_MAX_CONNECTIONS=1
14 | 
15 | pip install flask-restful
16 | 
17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
18 |        --tensor-model-parallel-size 1  \
19 |        --pipeline-model-parallel-size 1  \
20 |        --num-layers 24  \
21 |        --hidden-size 1024  \
22 |        --load ${CHECKPOINT}  \
23 |        --num-attention-heads 16  \
24 |        --max-position-embeddings 1024  \
25 |        --tokenizer-type GPT2BPETokenizer  \
26 |        --fp16  \
27 |        --micro-batch-size 1  \
28 |        --seq-length 1024  \
29 |        --vocab-file $VOCAB_FILE  \
30 |        --merge-file $MERGE_FILE  \
31 |        --seed 42
32 | 


--------------------------------------------------------------------------------
/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel
 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \
 4 |                   --nnodes 1 \
 5 |                   --node_rank 0 \
 6 |                   --master_addr localhost \
 7 |                   --master_port 6000"
 8 | 
 9 | CHECKPOINT=<Path to checkpoint (e.g /345m)>
10 | VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
11 | MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
12 | 
13 | # Same setting as run_text_generation_server_345M.sh.
14 | # NOTE(review): presumably required by Megatron when tensor parallelism is enabled - confirm.
15 | export CUDA_DEVICE_MAX_CONNECTIONS=1
16 | 
17 | pip install flask-restful
18 | 
19 | # torchrun replaces the deprecated `python -m torch.distributed.launch` launcher
20 | # and matches the launcher used by the single-GPU example script.
21 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
22 |        --tensor-model-parallel-size 8  \
23 |        --pipeline-model-parallel-size 1  \
24 |        --num-layers 24  \
25 |        --hidden-size 1024  \
26 |        --load ${CHECKPOINT}  \
27 |        --num-attention-heads 16  \
28 |        --max-position-embeddings 1024  \
29 |        --tokenizer-type GPT2BPETokenizer  \
30 |        --fp16  \
31 |        --micro-batch-size 1  \
32 |        --seq-length 1024  \
33 |        --vocab-file $VOCAB_FILE  \
34 |        --merge-file $MERGE_FILE  \
35 |        --seed 42
36 | 


--------------------------------------------------------------------------------
/examples/mamba/.gitignore:
--------------------------------------------------------------------------------
1 | checkpoints/
2 | data-cache/
3 | tensorboard/
4 | triton-cache/
5 | 


--------------------------------------------------------------------------------
/examples/multimodal/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvcr.io/nvidia/pytorch:24.02-py3
 2 | 
 3 | # System packages. apt-get is used instead of apt because apt's command-line
 4 | # interface is not guaranteed stable for scripted use; the package lists are
 5 | # removed in the same layer so they do not end up in the image.
 6 | RUN apt-get update && \
 7 |     apt-get -y upgrade && \
 8 |     apt-get install -y --no-install-recommends \
 9 |         software-properties-common \
10 |         build-essential \
11 |         python3-pip \
12 |         python3-dev \
13 |         bash \
14 |         git \
15 |         vim \
16 |         tmux \
17 |         python-is-python3 \
18 |         default-jre && \
19 |     rm -rf /var/lib/apt/lists/*
20 | 
21 | # Python dependencies.
22 | RUN pip install --upgrade pip
23 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging
24 | RUN pip install transformers datasets accelerate timm
25 | RUN pip install pytest-cov pytest_mock nltk wrapt
26 | RUN pip install zarr "tensorstore==0.1.45"
27 | RUN pip install black isort click==8.0.2
28 | RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken
29 | RUN pip install git+https://github.com/openai/CLIP.git
30 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies.
31 | RUN pip install open_clip_torch open-flamingo[eval] --no-deps
32 | 


--------------------------------------------------------------------------------
/examples/multimodal/assets/pretrain_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/examples/multimodal/assets/pretrain_curves.png


--------------------------------------------------------------------------------
/examples/multimodal/convert_llava_pretrain_to_wds.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import webdataset as wds
 4 | 
 5 | from tqdm import tqdm
 6 | 
 7 | llava_pretrain_dir = '<path_to_LLaVA-Pretrain>'
 8 | 
 9 | # Paths to the dataset files
10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json')
11 | output = os.path.join(llava_pretrain_dir, 'wds')
12 | 
13 | if not os.path.exists(output):
14 |     os.mkdir(output)
15 | 
16 | # Load data
17 | with open(json_file, 'r') as f:
18 |     data = json.load(f)
19 | 
20 | with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer:
21 |     for entry in tqdm(data):
22 |         with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file:
23 |                 image_data = img_file.read()
24 |         sample = {
25 |             "__key__": entry['id'],
26 |             "jpg": image_data,
27 |             "json": json.dumps(entry['conversations']).encode("utf-8"),
28 |         }
29 |         shard_writer.write(sample)
30 | 
31 | print(f"Dataset successfully converted to wds")
32 | 


--------------------------------------------------------------------------------
/examples/multimodal/nvlm/pretrain_blend.yaml:
--------------------------------------------------------------------------------
 1 | __module__: megatron.energon
 2 | __class__: Metadataset
 3 | splits:
 4 |   train:
 5 |     datasets:
 6 |       - weight: 0.579   # Datasets are weighted according to their size. Weights sum up to 1.
 7 |         path: <path to laion dataset>
 8 |         subflavors:
 9 |           augmentation: False
10 | 
11 |       - weight: 0.02
12 |         path: <path to coco>
13 |         subflavors:
14 |           augmentation: False
15 | 
16 |       - weight: 0.01
17 |         path: <path to vqav2 dataset>
18 |         subflavors:
19 |           augmentation: False
20 | 
21 |       # Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets.
22 |       # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
23 |   val:
24 |     datasets:
25 |       - weight: 1.
26 |         path: <path to validation dataset>
27 |         subflavors:
28 |           augmentation: False
29 | 


--------------------------------------------------------------------------------
/examples/multimodal/nvlm/sft_blend.yaml:
--------------------------------------------------------------------------------
 1 | __module__: megatron.energon
 2 | __class__: Metadataset
 3 | splits:
 4 |   train:
 5 |     datasets:
 6 |       - weight: 0.01  # Datasets are weighted according to their size. Weights sum up to 1.
 7 |         path: <path to coco>
 8 |         subflavors:
 9 |           augmentation: False
10 | 
11 |       - weight: 0.02
12 |         path: <path to clevr-math dataset>
13 |         subflavors:
14 |           augmentation: False
15 | 
16 |       # Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets.
17 |       # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
18 |   val:
19 |     datasets:
20 |       - weight: 1.
21 |         path: <path to validation dataset>
22 |         subflavors:
23 |           augmentation: False
24 | 


--------------------------------------------------------------------------------
/examples/multimodal/pretrain_dataset.yaml:
--------------------------------------------------------------------------------
 1 | __module__: megatron.energon
 2 | __class__: Metadataset
 3 | splits:
 4 |   train:
 5 |     datasets:
 6 |       - weight: 1.
 7 |         path: <path_to_pretraining_dataset_in_energon_format>
 8 |         subflavors:
 9 |           augmentation: false
10 |   val:
11 |     datasets:
12 |       - weight: 1.
13 |         path: <path_to_pretraining_dataset_in_energon_format>
14 |         subflavors:
15 |           augmentation: false
16 | 


--------------------------------------------------------------------------------
/examples/multimodal/sft_dataset.yaml:
--------------------------------------------------------------------------------
 1 | __module__: megatron.energon
 2 | __class__: Metadataset
 3 | splits:
 4 |   train:
 5 |     datasets:
 6 |       - weight: 1.
 7 |         path: <path_to_sft_dataset_in_energon_format>
 8 |         subflavors:
 9 |           augmentation: false
10 |   val:
11 |     datasets:
12 |       - weight: 1.
13 |         path: <path_to_sft_dataset_in_energon_format>
14 |         subflavors:
15 |           augmentation: false
16 | 


--------------------------------------------------------------------------------
/examples/t5/t5_mcore_train_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/examples/t5/t5_mcore_train_curve.png


--------------------------------------------------------------------------------
/images/model_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/model_table.png


--------------------------------------------------------------------------------
/images/remoe_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/remoe_comparison.png


--------------------------------------------------------------------------------
/images/remoe_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/remoe_scaling.png


--------------------------------------------------------------------------------
/images/strong_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/strong_scaling.png


--------------------------------------------------------------------------------
/images/weak_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/weak_scaling.png


--------------------------------------------------------------------------------
/megatron/core/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | import megatron.core.tensor_parallel
 3 | import megatron.core.utils
 4 | from megatron.core import parallel_state
 5 | from megatron.core.distributed import DistributedDataParallel
 6 | from megatron.core.inference_params import InferenceParams
 7 | from megatron.core.model_parallel_config import ModelParallelConfig
 8 | from megatron.core.package_info import (
 9 |     __contact_emails__,
10 |     __contact_names__,
11 |     __description__,
12 |     __download_url__,
13 |     __homepage__,
14 |     __keywords__,
15 |     __license__,
16 |     __package_name__,
17 |     __repository_url__,
18 |     __shortversion__,
19 |     __version__,
20 | )
21 | from megatron.core.timers import Timers
22 | 
23 | # Alias parallel_state as mpu, its legacy name
24 | mpu = parallel_state
25 | 
26 | __all__ = [
27 |     "parallel_state",
28 |     "tensor_parallel",
29 |     "utils",
30 |     "DistributedDataParallel",
31 |     "InferenceParams",
32 |     "ModelParallelConfig",
33 |     "Timers",
34 | ]
35 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
 3 | 
 4 | LIBNAME = helpers_cpp
 5 | LIBEXT = $(shell python3-config --extension-suffix)
 6 | 
 7 | OUT = $(LIBNAME)$(LIBEXT)
 8 | SRC = helpers.cpp
 9 | 
10 | default: $(OUT)
11 | 
12 | $(OUT): $(SRC)
13 | 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
14 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/datasets/__init__.py


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
2 | 
3 | from .config import RetroGPTChunkDatasets
4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig
5 | from .query.retro_dataset import get_retro_datasets
6 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/config/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """
 4 | Exports:
 5 | 
 6 |   - Embedder: Base class for all Bert embedders.
 7 |   - RetroBertEmbedders: Container class for in-memory and on-disk embedders.
 8 |   - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing.
 9 |   - RetroGPTChunkDatasets: Container class for train, valid, and test datasets.
10 |   - RetroTokenizers: Container class for GPT and Bert tokenizers.
11 | """
12 | 
13 | from .bert_embedders import Embedder, RetroBertEmbedders
14 | from .config import RetroPreprocessingConfig
15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets
16 | from .tokenizers import RetroTokenizers
17 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/config/gpt_chunk_datasets.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """Container dataclass for GPT chunk datasets (train, valid, and test)."""
 4 | 
 5 | from dataclasses import dataclass
 6 | 
 7 | 
 8 | @dataclass
 9 | class RetroGPTChunkDatasets:
10 |     """Container dataclass for GPT chunk datasets."""
11 | 
12 |     # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'.
13 |     train: dict = None
14 |     valid: dict = None
15 |     test: dict = None
16 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/config/tokenizers.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """Container class for GPT and Bert tokenizers."""
 4 | 
 5 | from dataclasses import dataclass
 6 | 
 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 8 | 
 9 | 
10 | @dataclass
11 | class RetroTokenizers:
12 |     """Container class for GPT and Bert tokenizers."""
13 | 
14 |     gpt: MegatronTokenizer = None
15 |     bert: MegatronTokenizer = None
16 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/db/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """
 4 | Exports:
 5 | 
 6 |   - build_db: Build a chunk database from a list of indexed datasets.
 7 | """
 8 | 
 9 | from .build import build_db
10 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/external_libs.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """Required external libraries for Retro preprocessing."""
 4 | 
 5 | import importlib
 6 | 
 7 | required_libs = ["faiss", "h5py", "transformers"]  # for huggingface bert
 8 | 
 9 | for lib in required_libs:
10 |     try:
11 |         globals()[lib] = importlib.import_module(lib)
12 |     except ImportError as e:
13 |         raise Exception(
14 |             f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'."
15 |         )
16 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/index/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """
 4 | Exports:
 5 | 
 6 |   - train_index: Train an index on representative vectors.
 7 |   - add_to_index: Add vectors to a trained index.
 8 |   - build_index: Wrapper function that calls above two functions.
 9 | """
10 | 
11 | from .build import add_to_index, build_index, train_index
12 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/index/indexes/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """
 4 | Exports:
 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper
 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index.
 7 | """
 8 | 
 9 | from .faiss_base import FaissBaseIndex
10 | from .faiss_par_add import FaissParallelAddIndex
11 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/query/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/datasets/retro/query/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """Utilities for querying the pretraining dataset."""
 4 | 
 5 | import os
 6 | 
 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset
 8 | 
 9 | 
10 | def get_query_dir(project_dir: str) -> str:
11 |     """Get root directory of all saved query data.
12 | 
13 |     Args:
14 |         project_dir (str): Retro project dir.
15 | 
16 |     Returns:
17 |         Path to query sub-directory in Retro project.
18 |     """
19 |     return os.path.join(project_dir, "query")
20 | 
21 | 
22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str:
23 |     """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test).
24 | 
25 |     Args:
26 |         project_dir (str): Retro project dir.
27 |         key (str): Dataset split key; 'train', 'valid', or 'test'.
28 |         dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors.
29 | 
30 |     Returns:
31 |         Path to directory containing this dataset's neighbors within Retro project.
32 |     """
33 |     return os.path.join(
34 |         get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}")
35 |     )
36 | 


--------------------------------------------------------------------------------
/megatron/core/dist_checkpointing/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | from .core import check_is_distributed_checkpoint
 4 | from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor
 5 | from .serialization import (
 6 |     load,
 7 |     load_common_state_dict,
 8 |     load_plain_tensors,
 9 |     load_tensors_metadata,
10 |     remove_sharded_tensors,
11 |     save,
12 | )
13 | 


--------------------------------------------------------------------------------
/megatron/core/dist_checkpointing/strategies/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
2 | 
3 | """ Various loading and saving strategies """
4 | from megatron.core.dist_checkpointing.strategies.common import register_default_common_strategies
5 | 
6 | # We load "common" strategies by default to be always available
7 | register_default_common_strategies()
8 | 


--------------------------------------------------------------------------------
/megatron/core/distributed/README.md:
--------------------------------------------------------------------------------
 1 | ## How to use PyTorch FSDP2?
 2 | 
 3 | Add these flags to enable Torch FSDP2.
 4 | 
 5 | ```
 6 | --use-torch-fsdp2
 7 | --no-gradient-accumulation-fusion
 8 | --ckpt-format torch_dist
 9 | ```
10 | 
11 | Note that `CUDA_MAX_CONNECTIONS=1` should not be set, so that FSDP communication and the computation on the primary stream can fully overlap.
12 | 


--------------------------------------------------------------------------------
/megatron/core/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from packaging.version import Version
4 | 
5 | from .distributed_data_parallel import DistributedDataParallel
6 | from .distributed_data_parallel_config import DistributedDataParallelConfig
7 | from .finalize_model_grads import finalize_model_grads
8 | from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel
9 | 


--------------------------------------------------------------------------------
/megatron/core/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | 
 6 | class ModelType(enum.Enum):
 7 |     encoder_or_decoder = 1
 8 |     encoder_and_decoder = 2
 9 |     retro_encoder = 3
10 |     retro_decoder = 4
11 | 


--------------------------------------------------------------------------------
/megatron/core/export/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/export/data_type.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from enum import Enum
4 | 
5 | DataType = Enum('DataType', ["bfloat16", "float16", "float32"])
6 | 


--------------------------------------------------------------------------------
/megatron/core/export/export_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from dataclasses import dataclass
 4 | 
 5 | 
 6 | @dataclass
 7 | class ExportConfig:
 8 |     """Base configuration for Megatron Core Export
 9 | 
10 |     These parameters control the export setting for trtllm
11 |     """
12 | 
13 |     inference_tp_size: int = 1
14 | 
15 |     inference_pp_size: int = 1
16 | 
17 |     use_parallel_embedding: bool = False
18 | 
19 |     use_embedding_sharing: bool = False
20 | 


--------------------------------------------------------------------------------
/megatron/core/export/model_type.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from enum import Enum
4 | 
5 | ModelType = Enum(
6 |     'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"]
7 | )
8 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/engine_builder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/trt_model_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import tensorrt_llm
 4 | 
 5 | from megatron.core.export.model_type import ModelType
 6 | 
 7 | TRT_MODEL_CONFIG = {
 8 |     ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig,
 9 |     ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig,
10 |     ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig,
11 |     ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig,
12 |     ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig,
13 |     ModelType.gemma: tensorrt_llm.models.GemmaConfig,
14 |     ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig,
15 | }
16 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/trt_model_type.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from megatron.core.export.model_type import ModelType
 4 | 
 5 | TRT_MODEL_TYPE_STRING = {
 6 |     ModelType.gpt: 'GPTForCausalLM',
 7 |     ModelType.gptnext: 'GPTForCausalLM',
 8 |     ModelType.starcoder: 'GPTForCausalLM',
 9 |     ModelType.mixtral: 'LlamaForCausalLM',
10 |     ModelType.llama: 'LlamaForCausalLM',
11 |     ModelType.gemma: 'GemmaForCausalLM',
12 |     ModelType.falcon: 'FalconForCausalLM',
13 | }
14 | 


--------------------------------------------------------------------------------
/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/extensions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/extensions/__init__.py


--------------------------------------------------------------------------------
/megatron/core/fusions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/fusions/__init__.py


--------------------------------------------------------------------------------
/megatron/core/inference/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/ammo_support/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | import warnings
3 | 
4 | warnings.warn(
5 |     "The 'megatron.core.inference.ammo_support' module is deprecated and will be removed in a future release. "
6 |     "Please use megatron.core.inference.modelopt_support instead",
7 |     DeprecationWarning,
8 | )
9 | 


--------------------------------------------------------------------------------
/megatron/core/inference/ammo_support/gpt/model_specs.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec
3 | 


--------------------------------------------------------------------------------
/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import (
3 |     mcore_gpt_load_legacy_state_dict_pre_hook,
4 |     mcore_gpt_load_te_state_dict_pre_hook,
5 | )
6 | 


--------------------------------------------------------------------------------
/megatron/core/inference/common_inference_params.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | from dataclasses import dataclass
 3 | 
 4 | 
 5 | @dataclass
 6 | class CommonInferenceParams:
 7 |     """Inference parameters sent along with the prompts
 8 | 
 9 |     For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910
10 |     """
11 | 
12 |     temperature: float = 1.0
13 |     top_k: int = 0
14 |     top_p: float = 0.0
15 |     return_log_probs: bool = False
16 |     num_tokens_to_generate: int = 30
17 | 
18 |     def add_attributes(self, attribute_value_pair: dict):
19 |         """Utility to add more attributes to inference params
20 | 
21 |         Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows
22 |         c = CommonInferenceParams
23 |         c.add_attributes({'min_length':4, 'eod_id':153})
24 | 
25 |         Args:
26 |             attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values.
27 |         """
28 |         for key, value in attribute_value_pair.items():
29 |             setattr(self, key, value)
30 | 


--------------------------------------------------------------------------------
/megatron/core/inference/engines/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/engines/abstract_engine.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | from typing import List
 4 | 
 5 | 
 6 | class AbstractEngine(ABC):
 7 |     @staticmethod
 8 |     @abstractmethod
 9 |     def generate(self) -> dict:
10 |         """The abstract backend's generate function.
11 | 
12 |         To define a new backend, implement this and return the outputs as a dictionary.
13 | 
14 |         Returns:
15 |             dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`.
16 |         """
17 |         pass
18 | 


--------------------------------------------------------------------------------
/megatron/core/inference/inference_request.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | from dataclasses import dataclass
 3 | from enum import Enum
 4 | from typing import List
 5 | 
 6 | import torch
 7 | 
 8 | from megatron.core.inference.common_inference_params import CommonInferenceParams
 9 | 
10 | 
11 | # class syntax
12 | class Status(Enum):
13 |     """Enum for status"""
14 | 
15 |     WAITING_IN_QUEUE = 1
16 |     ACTIVE_AND_GENERATING_TOKENS = 2
17 |     ACTIVE_BUT_NOT_GENERATING_TOKENS = 3
18 |     COMPLETED = 4
19 | 
20 | 
21 | @dataclass
22 | class InferenceRequest:
23 |     """Class for one inference request
24 | 
25 |     Containing relevant data for an inference request
26 | 
27 |     """
28 | 
29 |     request_id: str
30 |     prompt: str
31 |     inference_parameters: CommonInferenceParams
32 |     prompt_tokens: List[int]
33 |     arrival_time: float
34 |     status: Status
35 |     encoder_prompt: str = None
36 |     generated_text: str = None
37 |     generated_tokens: torch.Tensor = None
38 |     generated_log_probs: torch.Tensor = None
39 |     generated_length: int = 0
40 | 


--------------------------------------------------------------------------------
/megatron/core/inference/model_inference_wrappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/model_inference_wrappers/gpt/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/model_inference_wrappers/t5/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/modelopt_support/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | """Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt).
3 | 
4 | ModelOpt is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to
5 | compress model for efficient inference on NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
6 | experience for users to optimize their Megatron-core models for inference. More details on ModelOpt including
7 | installation and usage can be found at https://github.com/NVIDIA/TensorRT-Model-Optimizer.
8 | """
9 | 


--------------------------------------------------------------------------------
/megatron/core/inference/modelopt_support/gpt/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/text_generation_controllers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/inference/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | class Counter:
 3 |     """A simple integer counter advanced by calling ``next()`` on the instance.
 4 | 
 5 |     This class is responsible for assigning request ids to incoming requests.
 6 |     """
 7 | 
 8 |     def __init__(self, start: int = 0) -> None:
 9 |         self.counter = start  # value handed out by the next ``next()`` call
10 | 
11 |     def __next__(self) -> int:
12 |         i = self.counter  # return the current value, then advance by one
13 |         self.counter += 1
14 |         return i
15 | 
16 |     def reset(self) -> None:
17 |         self.counter = 0  # always rewinds to 0, independent of the ``start`` given to __init__
18 | 


--------------------------------------------------------------------------------
/megatron/core/jit.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | from megatron.core.utils import is_torch_min_version
 6 | 
 7 | jit_fuser = torch.jit.script
 8 | # nvFuser is deprecated in PyTorch JIT starting from 2.2
 9 | if is_torch_min_version("2.2.0a0"):
10 |     jit_fuser = torch.compile
11 | 


--------------------------------------------------------------------------------
/megatron/core/models/T5/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from .t5_model import T5Model
3 | 


--------------------------------------------------------------------------------
/megatron/core/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/__init__.py


--------------------------------------------------------------------------------
/megatron/core/models/bert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/bert/__init__.py


--------------------------------------------------------------------------------
/megatron/core/models/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/__init__.py


--------------------------------------------------------------------------------
/megatron/core/models/common/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from .rope_utils import apply_rotary_pos_emb
4 | from .rotary_pos_embedding import RotaryEmbedding
5 | from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale
6 | 


--------------------------------------------------------------------------------
/megatron/core/models/common/language_module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/language_module/__init__.py


--------------------------------------------------------------------------------
/megatron/core/models/common/vision_module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/vision_module/__init__.py


--------------------------------------------------------------------------------
/megatron/core/models/common/vision_module/vision_module.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | """Megatron Vision Module."""
 3 | 
 4 | from megatron.core.transformer.module import MegatronModule
 5 | from megatron.core.transformer.transformer_config import TransformerConfig
 6 | 
 7 | 
 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes.
 9 | class VisionModule(MegatronModule):
10 |     """Base vision module that has common helper functions used across CLIP, ViT, etc.
11 | 
12 |     Args:
13 |         config (TransformerConfig): Input transformer config for the model
14 |     """
15 | 
16 |     def __init__(self, config: TransformerConfig) -> None:
17 |         super().__init__(config=config)
18 | 


--------------------------------------------------------------------------------
/megatron/core/models/gpt/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from .gpt_model import GPTModel
3 | 


--------------------------------------------------------------------------------
/megatron/core/models/mamba/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from .mamba_model import MambaModel
3 | 


--------------------------------------------------------------------------------
/megatron/core/models/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/core/models/retro/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | """
 4 | Exports:
 5 | 
 6 |   - RetroConfig: configuration dataclass for RetroModel.
 7 |   - RetroModel: The Retro model.
 8 |   - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block.
 9 | """
10 | 
11 | from .config import RetroConfig
12 | from .decoder_spec import get_retro_decoder_block_spec
13 | from .model import RetroModel
14 | 


--------------------------------------------------------------------------------
/megatron/core/models/retro/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | import os
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | def get_config_path(project_dir: str) -> str:
 9 |     """Config copy stored within retro project dir."""
10 |     return os.path.join(project_dir, "config.json")
11 | 
12 | 
13 | def get_gpt_data_dir(project_dir: str) -> str:
14 |     """Get project-relative directory of GPT bin/idx datasets."""
15 |     return os.path.join(project_dir, "data")
16 | 
17 | 
18 | # ** Note ** : Retro's compatibility between cross attention and Flash/Fused
19 | #   Attention is currently a work in progress. We default to returning None for
20 | #   now.
21 | # def get_all_true_mask(size, device):
22 | #     return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device)
23 | def get_all_true_mask(size, device):
24 |     return None
25 | 


--------------------------------------------------------------------------------
/megatron/core/models/vision/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/vision/__init__.py


--------------------------------------------------------------------------------
/megatron/core/package_info.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | 
 4 | MAJOR = 0
 5 | MINOR = 10
 6 | PATCH = 0
 7 | PRE_RELEASE = 'rc0'
 8 | 
 9 | # Use the following formatting: (major, minor, patch, pre-release)
10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
11 | 
12 | __shortversion__ = '.'.join(map(str, VERSION[:3]))
13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])
14 | 
15 | __package_name__ = 'megatron_core'
16 | __contact_names__ = 'NVIDIA'
17 | __contact_emails__ = 'nemo-toolkit@nvidia.com'  # use NeMo Email
18 | __homepage__ = (
19 |     'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/'  # use NeMo homepage
20 | )
21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
23 | __description__ = (
24 |     'Megatron Core - a library for efficient and scalable training of transformer based models'
25 | )
26 | __license__ = 'BSD-3'
27 | __keywords__ = (
28 |     'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
29 | )
30 | 


--------------------------------------------------------------------------------
/megatron/core/packed_seq_params.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | from dataclasses import dataclass
 3 | 
 4 | from torch import Tensor
 5 | 
 6 | 
 7 | @dataclass
 8 | class PackedSeqParams:
 9 |     '''
10 |     parameters to TEDotProductAttention and fused rope kernels for the
11 |     `thd` (packed) sequence format
12 |     '''
13 | 
14 |     qkv_format: str = None
15 |     cu_seqlens_q: Tensor = None
16 |     cu_seqlens_kv: Tensor = None
17 |     cu_seqlens_q_padded: Tensor = None
18 |     cu_seqlens_kv_padded: Tensor = None
19 |     max_seqlen_q: Tensor = None
20 |     max_seqlen_kv: Tensor = None
21 | 


--------------------------------------------------------------------------------
/megatron/core/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from .schedules import get_forward_backward_func
3 | 


--------------------------------------------------------------------------------
/megatron/core/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | packaging
3 | 


--------------------------------------------------------------------------------
/megatron/core/ssm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/ssm/__init__.py


--------------------------------------------------------------------------------
/megatron/core/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from .module import MegatronModule
4 | from .spec_utils import ModuleSpec, build_module
5 | from .transformer_config import MLATransformerConfig, TransformerConfig
6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules
7 | 


--------------------------------------------------------------------------------
/megatron/core/transformer/custom_layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/transformer/custom_layers/__init__.py


--------------------------------------------------------------------------------
/megatron/core/transformer/custom_layers/transformer_engine.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import warnings
 4 | 
 5 | warnings.warn(
 6 |     """The 'megatron.core.transformer.custom_layers.transformer_engine' 
 7 |     module is deprecated and will be removed in 0.10.0. Please use 
 8 |     'megatron.core.extensions.transformer_engine' instead.""",
 9 |     DeprecationWarning,
10 |     stacklevel=2,
11 | )
12 | from megatron.core.extensions.transformer_engine import *
13 | 


--------------------------------------------------------------------------------
/megatron/core/transformer/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | 
 6 | # can we get rid of this?
 7 | # it's being used in pipeline schedules
 8 | class ModelType(enum.Enum):
 9 |     """Model Type
10 | 
11 |     encoder_or_decoder for bert, gpt etc
12 |     encoder_and_decoder for multimodal, T5 etc
13 |     """
14 | 
15 |     encoder_or_decoder = 1  # e.g. bert, gpt
16 |     encoder_and_decoder = 2  # e.g. multimodal, T5
17 | 
18 | 
19 | # class LayerType(enum.Enum):
20 | #     encoder = 1
21 | #     decoder = 2
22 | 
23 | 
24 | class AttnType(enum.Enum):
25 |     """Attention type"""
26 | 
27 |     self_attn = 1
28 |     cross_attn = 2
29 | 
30 | 
31 | class AttnMaskType(enum.Enum):
32 |     """Attention Mask Type"""
33 | 
34 |     padding = 1
35 |     causal = 2
36 |     no_mask = 3  # only used for TE
37 |     padding_causal = 4  # only used for thd attention
38 |     arbitrary = 5
39 | 
40 | 
41 | class AttnBackend(enum.Enum):
42 |     """Attention Backend"""
43 | 
44 |     flash = 1
45 |     fused = 2
46 |     unfused = 3
47 |     local = 4
48 |     auto = 5
49 | 


--------------------------------------------------------------------------------
/megatron/core/transformer/identity_op.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | import torch
 3 | 
 4 | 
 5 | class IdentityOp(torch.nn.Module):
 6 |     """
 7 |     This is a placeholder for IdentityOp(x) -> x
 8 |     """
 9 | 
10 |     def __init__(self, *args, **kwargs):
11 |         super().__init__()
12 | 
13 |     def forward(self, x, *args, **kwargs):
14 |         return x
15 | 
16 | 
17 | class IdentityFuncOp(IdentityOp):
18 |     """
19 |     This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
20 |     Such a func is handy for ops like `bias_dropout_fusion` which themselves
21 |     return a function at runtime based on passed arguments
22 |     """
23 | 
24 |     def __init__(self, *args, **kwargs):
25 |         super().__init__()  # constructor arguments are accepted and ignored
26 | 
27 |     def forward(self, *args, **kwargs):
28 |         return super().forward  # the bound IdentityOp.forward callable itself, not its result
29 | 


--------------------------------------------------------------------------------
/megatron/core/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/transformer/moe/__init__.py


--------------------------------------------------------------------------------
/megatron/core/transformer/moe/grouped_gemm_util.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | try:
 4 |     import grouped_gemm
 5 | except ImportError:
 6 |     grouped_gemm = None
 7 | 
 8 | 
 9 | def grouped_gemm_is_available():
10 |     """Check if grouped_gemm is available."""
11 |     return grouped_gemm is not None
12 | 
13 | 
14 | def assert_grouped_gemm_is_available():
15 |     """Assert that grouped_gemm is available."""
16 |     assert grouped_gemm_is_available(), (
17 |         "Grouped GEMM is not available. Please run "
18 |         "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4`."
19 |     )
20 | 
21 | 
22 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None
23 | 


--------------------------------------------------------------------------------
/megatron/core/transformer/torch_layer_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | from megatron.core.transformer.torch_norm import WrappedTorchNorm
3 | 
4 | WrappedTorchLayerNorm = WrappedTorchNorm
5 | 


--------------------------------------------------------------------------------
/megatron/inference/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/inference/algos/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/inference/endpoints/common.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | import threading
 5 | 
 6 | GENERATE_NUM = 0
 7 | BEAM_NUM = 1
 8 | LOCK = threading.Lock()
 9 | 
10 | 
11 | def send_do_generate():
12 |     choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device="cuda")
13 |     torch.distributed.broadcast(choice, 0)
14 | 
15 | 
16 | def send_do_beam_search():
17 |     choice = torch.tensor([BEAM_NUM], dtype=torch.long, device="cuda")
18 |     torch.distributed.broadcast(choice, 0)
19 | 


--------------------------------------------------------------------------------
/megatron/inference/gpt/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from .loss_func import loss_func
4 | from .model_provider import model_provider
5 | 


--------------------------------------------------------------------------------
/megatron/inference/text_generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .api import (
5 |     generate,
6 |     generate_and_post_process,
7 |     beam_search_and_post_process)
8 | 


--------------------------------------------------------------------------------
/megatron/legacy/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/megatron/legacy/fp16_deprecated/loss_scaler.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """For backward compatibility, we need the class definitions to deserialize."""
 4 | 
 5 | class LossScaler:
 6 |     def __init__(self, scale=1):
 7 |         self.cur_scale = scale
 8 | 
 9 | class DynamicLossScaler:
10 |     def __init__(self,
11 |                  init_scale=2**32,
12 |                  scale_factor=2.,
13 |                  scale_window=1000,
14 |                  min_scale=1,
15 |                  delayed_shift=1,
16 |                  consecutive_hysteresis=False):
17 |         self.cur_scale = init_scale
18 |         self.cur_iter = 0
19 |         self.last_overflow_iter = -1
20 |         self.scale_factor = scale_factor
21 |         self.scale_window = scale_window
22 |         self.min_scale = min_scale
23 |         self.delayed_shift = delayed_shift
24 |         self.cur_hysteresis = delayed_shift
25 |         self.consecutive_hysteresis = consecutive_hysteresis
26 | 
27 | 


--------------------------------------------------------------------------------
/megatron/legacy/fused_kernels/compat.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved. */
 2 | 
 3 | /*This code is copied from NVIDIA apex:
 4 |  *     https://github.com/NVIDIA/apex
 5 |  *     with minor changes. */
 6 | 
 7 | 
 8 | 
 9 | #ifndef TORCH_CHECK
10 | #define TORCH_CHECK AT_CHECK
11 | #endif
12 | 
13 | #ifdef VERSION_GE_1_3
14 | #define DATA_PTR data_ptr
15 | #else
16 | #define DATA_PTR data
17 | #endif
18 | 


--------------------------------------------------------------------------------
/megatron/legacy/fused_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/legacy/fused_kernels/tests/__init__.py


--------------------------------------------------------------------------------
/megatron/legacy/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
 4 | from .rms_norm import RMSNorm
 5 | 
 6 | from .bert_model import BertModel
 7 | from .gpt_model import GPTModel
 8 | from .t5_model import T5Model
 9 | from .language_model import get_language_model
10 | from .module import Float16Module
11 | 


--------------------------------------------------------------------------------
/megatron/legacy/model/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | class LayerType(enum.Enum):
 6 |     encoder = 1
 7 |     decoder = 2
 8 |     retro_encoder = 3
 9 |     retro_decoder = 4
10 |     retro_decoder_with_retriever = 5
11 |  
12 | class AttnType(enum.Enum):
13 |     self_attn = 1
14 |     cross_attn = 2
15 | 
16 | class AttnMaskType(enum.Enum):
17 |     padding = 1
18 |     causal = 2
19 | 
20 | # For backward compatibility with old model checkpoints
21 | from megatron.core.enums import ModelType
22 | 


--------------------------------------------------------------------------------
/megatron/legacy/model/rms_norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | class RMSNorm(torch.nn.Module):
 7 |     """Root-mean-square normalization over the last dimension with a learnable scale of size ``dim``."""
 8 |     def __init__(self,
 9 |                  dim: int,
10 |                  eps: float = 1e-6,
11 |                  sequence_parallel: bool = False,
12 |                  config: dict = None):
13 |         """RMS Normalization module
14 | 
15 |         Args:
16 |             dim (int): The width of input, i.e. hidden size
17 |             eps (float): epsilon to use for the norm, default to 1e-6
18 |             sequence_parallel (bool): Set to true if sequence parallelism is being used;
19 |               marks the weights as needing to be allreduced. ``config`` is accepted but unused here.
20 |         """
21 |         super().__init__()
22 |         self.eps = eps
23 |         self.weight = nn.Parameter(torch.ones(dim))
24 |         # Store the flag as an attribute on the parameter itself.
25 |         setattr(self.weight, 'sequence_parallel', sequence_parallel)
26 | 
27 |     def _norm(self, x):
28 |         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)  # x / sqrt(mean(x^2) + eps)
29 | 
30 |     def forward(self, x):
31 |         output = self._norm(x.float()).type_as(x)  # normalize in float32, then cast back to x's type
32 |         return output * self.weight
33 | 


--------------------------------------------------------------------------------
/megatron/legacy/model/vision/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | import warnings
 3 | import torch
 4 | import torch.nn.functional as F
 5 | 
 6 | 
 7 | def resize(input,
 8 |            size=None,
 9 |            scale_factor=None,
10 |            mode='nearest',
11 |            align_corners=None,
12 |            warning=True):
13 |     if warning:
14 |         if size is not None and align_corners:
15 |             input_h, input_w = tuple(int(x) for x in input.shape[2:])
16 |             output_h, output_w = tuple(int(x) for x in size)
17 |             if output_h > input_h or output_w > output_h:
18 |                 if ((output_h > 1 and output_w > 1 and input_h > 1
19 |                      and input_w > 1) and (output_h - 1) % (input_h - 1)
20 |                         and (output_w - 1) % (input_w - 1)):
21 |                     warnings.warn(
22 |                         f'When align_corners={align_corners}, '
23 |                         'the output would more aligned if '
24 |                         f'input size {(input_h, input_w)} is `x+1` and '
25 |                         f'out size {(output_h, output_w)} is `nx+1`')
26 |     if isinstance(size, torch.Size):
27 |         size = tuple(int(x) for x in size)
28 |     return F.interpolate(input, size, scale_factor, mode, align_corners)
29 | 


--------------------------------------------------------------------------------
/megatron/legacy/mpu/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/legacy/mpu/tests/__init__.py


--------------------------------------------------------------------------------
/megatron/training/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | from .global_vars import get_args
 6 | from .global_vars import get_signal_handler
 7 | from .global_vars import get_tokenizer
 8 | from .global_vars import get_tensorboard_writer
 9 | from .global_vars import get_wandb_writer
10 | from .global_vars import get_one_logger
11 | from .global_vars import get_adlr_autoresume
12 | from .global_vars import get_timers
13 | from .initialize  import initialize_megatron
14 | from .training import pretrain, get_model, get_train_valid_test_num_samples
15 | 
16 | from .utils import (print_rank_0,
17 |                     is_last_rank,
18 |                     print_rank_last)
19 | 


--------------------------------------------------------------------------------
/megatron/training/activations.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | import torch
 3 | import torch.nn.functional as F
 4 | 
 5 | from megatron.core.jit import jit_fuser
 6 | 
 7 | 
 8 | @jit_fuser
 9 | def squared_relu(x: torch.Tensor) -> torch.Tensor:
10 |     return torch.pow(F.relu(x), 2)
11 | 
12 | 
13 | @jit_fuser
14 | def quick_gelu(x: torch.Tensor) -> torch.Tensor:
15 |     return x * torch.sigmoid(1.702 * x)
16 | 
17 | @jit_fuser
18 | def fast_gelu(x: torch.Tensor) -> torch.Tensor:
19 |     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
20 | 


--------------------------------------------------------------------------------
/megatron/training/log_handler.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import sys
 4 | from logging import LogRecord, StreamHandler
 5 | 
 6 | BLACKLISTED_MODULES = ["torch.distributed"]
 7 | 
 8 | 
 9 | class CustomHandler(StreamHandler):
10 |     """
11 |     Custom handler to filter out logging from code outside of
12 |     Megatron Core, and dump to stdout.
13 |     """
14 | 
15 |     def __init__(self):
16 |         super().__init__(stream=sys.stdout)
17 | 
18 |     def filter(self, record: LogRecord) -> bool:
19 |         # Prevent log entries that come from the blacklisted modules
20 |         # through (e.g., PyTorch Distributed).
21 |         for blacklisted_module in BLACKLISTED_MODULES:
22 |             if record.name.startswith(blacklisted_module):
23 |                 return False
24 |         return True
25 | 


--------------------------------------------------------------------------------
/megatron/training/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .tokenizer import build_tokenizer
5 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | ignore_missing_imports = True
 3 | check_untyped_defs = False
 4 | disallow_untyped_calls = False
 5 | disallow_untyped_defs = False
 6 | disallow_incomplete_defs = False
 7 | 
 8 | disable_error_code = call-arg,operator,var-annotated,union-attr,import-untyped
 9 | 
10 | # Enable only `assignment` error checking
11 | enable_error_code = assignment


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # content of pytest.ini
2 | [pytest]
3 | markers =
4 |     internal: mark a test as a test to private/internal functions.


--------------------------------------------------------------------------------
/requirements/pytorch:24.01/requirements.txt:
--------------------------------------------------------------------------------
 1 | einops
 2 | flask-restful
 3 | nltk
 4 | pytest
 5 | pytest-cov
 6 | pytest_mock
 7 | pytest-random-order
 8 | sentencepiece
 9 | tiktoken
10 | wrapt
11 | zarr
12 | wandb
13 | triton==2.1.0
14 | tensorstore==0.1.45
15 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin"


--------------------------------------------------------------------------------
/requirements/pytorch:24.07/requirements.txt:
--------------------------------------------------------------------------------
 1 | einops
 2 | flask-restful
 3 | nltk
 4 | pytest
 5 | pytest-cov
 6 | pytest_mock
 7 | pytest-random-order
 8 | sentencepiece
 9 | tiktoken
10 | wrapt
11 | zarr
12 | wandb
13 | tensorstore==0.1.45
14 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin"


--------------------------------------------------------------------------------
/tasks/orqa/evaluate_orqa.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Main tasks functionality."""
 4 | 
 5 | from megatron.training import get_args, print_rank_0
 6 | from megatron.legacy.indexer import IndexBuilder
 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator
 8 | 
 9 | def main():
10 |     """
11 |     Main program
12 |     """
13 | 
14 |     args = get_args()
15 | 
16 |     """
17 |     Create a BlockData data structure by running an IndexBuilder over an
18 |     ICT Dataset and then evaluate on NQ task
19 |     """
20 | 
21 |     print_rank_0("Starting index builder!")
22 | 
23 |     index_builder = IndexBuilder()
24 |     index_builder.build_and_save_index()
25 |     print_rank_0("Build and save indices: done!")
26 | 
27 | 
28 |     print_rank_0("Starting evaluations!")
29 | 
30 |     # Set up the model and evaluator
31 |     evaluator = ORQAEvaluator()
32 | 
33 |     # Run evaluation
34 |     if args.qa_data_dev is not None:
35 |         evaluator.evaluate(args.qa_data_dev, "DEV")
36 | 
37 |     if args.qa_data_test is not None:
38 |         evaluator.evaluate(args.qa_data_test, "TEST")
39 | 
40 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/__init__.py


--------------------------------------------------------------------------------
/tests/functional_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/functional_tests/__init__.py


--------------------------------------------------------------------------------
/tests/functional_tests/python_test_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/functional_tests/python_test_utils/__init__.py


--------------------------------------------------------------------------------
/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ["OPENBLAS_NUM_THREADS"] = "1"  # NOTE(review): set ahead of the imports below, presumably so OpenBLAS sees it when first loaded - confirm
 4 | import json
 5 | 
 6 | import click
 7 | 
 8 | from tests.functional_tests.python_test_utils import common
 9 | 
10 | 
11 | @click.command()
12 | @click.option("--logs-dir", required=True, type=str, help="Path to Tensorboard logs")
13 | @click.option("--output-path", required=False, type=str, help="Path to write golden values")
14 | @click.option(
15 |     "--is-convergence-test/--is-normal-test",
16 |     type=bool,
17 |     help="Tensorboard index to extract",
18 |     default=False,
19 | )
20 | def collect_train_test_metrics(logs_dir: str, output_path: str, is_convergence_test: bool):
21 |     summaries = common.read_tb_logs_as_list(logs_dir, index=-1 if is_convergence_test else 0)
22 | 
23 |     train_metrics = {
24 |         metric_name: {
25 |             "start_step": 0,
26 |             "end_step": len(metric_values),
27 |             "step_interval": 5,
28 |             "values": metric_values[0 : len(metric_values) : 5],
29 |         }
30 |         for metric_name, metric_values in summaries.items()
31 |     }
32 | 
33 |     if output_path is not None:
34 |         with open(output_path, "w") as fh:
35 |             json.dump(train_metrics, fh)
36 | 
37 | 
38 | if __name__ == "__main__":  # only when this file is run directly as a script
39 |     collect_train_test_metrics()  # click parses the command-line options
40 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {   "lm loss": {
 2 |         "start_step": 0,
 3 |         "end_step": 50,
 4 |         "step_interval": 5,
 5 |         "values": [
 6 |             10.49569,
 7 |             10.48173,
 8 |             10.48047,
 9 |             10.45353,
10 |             10.44394,
11 |             10.35611,
12 |             10.13779,
13 |             10.04017,
14 |             9.86834,
15 |             9.67307
16 |         ]
17 |     },
18 |     "num-zeros": {
19 |         "start_step": 0,
20 |         "end_step": 50,
21 |         "step_interval": 5,
22 |         "values": [
23 |             2254.0,
24 |             2585.0,
25 |             2101.0,
26 |             2157.0,
27 |             2241.0,
28 |             2475.0,
29 |             2890.0,
30 |             3199.0,
31 |             3524.0,
32 |             3090.0
33 |         ]
34 |     },
35 |     "iteration-time": {
36 |         "start_step": 0,
37 |         "end_step": 50,
38 |         "step_interval": 5,
39 |         "values": [
40 |             13.65829,
41 |             1.27589,
42 |             1.2782,
43 |             1.32374,
44 |             1.26543,
45 |             1.26423,
46 |             1.26203,
47 |             1.54723,
48 |             1.27297,
49 |             1.26491
50 |         ]
51 |     }
52 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44396, 10.35607, 10.13786, 10.04016, 9.86838, 9.67302]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2291.0, 2485.0, 2953.0, 3287.0, 3440.0, 3059.0]}, "iteration_timing_avg": 0.8110379411764704}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.49411,
 8 |             10.4825,
 9 |             10.49242,
10 |             10.47802,
11 |             10.46608,
12 |             10.35193,
13 |             10.17693,
14 |             10.07728,
15 |             9.88753,
16 |             9.68034
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1931.0,
25 |             2555.0,
26 |             2017.0,
27 |             2135.0,
28 |             2440.0,
29 |             2464.0,
30 |             3070.0,
31 |             3006.0,
32 |             2932.0,
33 |             2303.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             10.94975,
42 |             0.67196,
43 |             0.67378,
44 |             0.66862,
45 |             0.69618,
46 |             0.66936,
47 |             0.67757,
48 |             0.67189,
49 |             0.67519,
50 |             0.67762
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.46796,
 8 |             10.45723,
 9 |             10.44911,
10 |             10.44107,
11 |             10.41739,
12 |             10.34626,
13 |             10.11387,
14 |             10.0439,
15 |             9.86702,
16 |             9.679
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             2404.0,
25 |             2610.0,
26 |             2173.0,
27 |             2312.0,
28 |             2371.0,
29 |             2652.0,
30 |             3089.0,
31 |             3200.0,
32 |             3497.0,
33 |             3075.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             15.80389,
42 |             0.94155,
43 |             0.88518,
44 |             1.22442,
45 |             0.86955,
46 |             0.85166,
47 |             1.02329,
48 |             1.07525,
49 |             0.90283,
50 |             0.88308
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.4681,
 8 |             10.45734,
 9 |             10.4491,
10 |             10.44121,
11 |             10.41764,
12 |             10.34626,
13 |             10.11384,
14 |             10.04383,
15 |             9.86686,
16 |             9.67906
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             2373.0,
25 |             2593.0,
26 |             2187.0,
27 |             2325.0,
28 |             2407.0,
29 |             2627.0,
30 |             3036.0,
31 |             3109.0,
32 |             3568.0,
33 |             3019.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             22.86543,
42 |             0.84168,
43 |             0.92727,
44 |             0.84734,
45 |             0.93196,
46 |             0.86308,
47 |             0.86633,
48 |             0.86112,
49 |             0.87598,
50 |             1.02461
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.42085,
 8 |             10.42901,
 9 |             10.43576,
10 |             10.40804,
11 |             10.38463,
12 |             10.32426,
13 |             10.13148,
14 |             10.04317,
15 |             9.86257,
16 |             9.65771
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             3252.0,
25 |             2595.0,
26 |             3240.0,
27 |             3429.0,
28 |             3463.0,
29 |             3509.0,
30 |             4065.0,
31 |             4114.0,
32 |             4651.0,
33 |             4253.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             10.83012,
42 |             2.26196,
43 |             2.22779,
44 |             2.22677,
45 |             2.23847,
46 |             2.24307,
47 |             2.23859,
48 |             2.23544,
49 |             2.2414,
50 |             2.25107
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.4209,
 8 |             10.42905,
 9 |             10.43557,
10 |             10.40806,
11 |             10.38457,
12 |             10.32414,
13 |             10.13167,
14 |             10.04335,
15 |             9.86262,
16 |             9.65771
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             2249.0,
25 |             3640.0,
26 |             3249.0,
27 |             2318.0,
28 |             3512.0,
29 |             3601.0,
30 |             4111.0,
31 |             3175.0,
32 |             4713.0,
33 |             3320.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             12.51144,
42 |             2.1285,
43 |             2.28886,
44 |             2.24273,
45 |             2.20818,
46 |             2.20231,
47 |             2.18786,
48 |             2.17554,
49 |             2.213,
50 |             2.18811
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.49101,
 8 |             10.49526,
 9 |             10.48682,
10 |             10.48817,
11 |             10.49415,
12 |             10.4724,
13 |             10.42265,
14 |             10.29901,
15 |             10.1572,
16 |             9.97594
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             12.56945,
25 |             0.58599,
26 |             0.58451,
27 |             0.68178,
28 |             0.6056,
29 |             0.609,
30 |             0.59965,
31 |             0.60618,
32 |             0.60152,
33 |             0.59945
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 34,
39 |         "step_interval": 5,
40 |         "values": [
41 |             17032.0,
42 |             16918.0,
43 |             19957.0,
44 |             18761.0,
45 |             25689.0,
46 |             19897.0,
47 |             22224.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.50096,
 8 |             10.48594,
 9 |             10.4936,
10 |             10.48501,
11 |             10.50417,
12 |             10.4773,
13 |             10.42154,
14 |             10.29716,
15 |             10.15831,
16 |             9.96751
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             12.85743,
25 |             0.58922,
26 |             0.54928,
27 |             0.54147,
28 |             0.56305,
29 |             0.56895,
30 |             0.56282,
31 |             0.56247,
32 |             0.56751,
33 |             0.69574
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 34,
39 |         "step_interval": 5,
40 |         "values": [
41 |             16595.0,
42 |             18537.0,
43 |             19509.0,
44 |             18532.0,
45 |             26712.0,
46 |             20164.0,
47 |             20981.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.49734,
 8 |             10.49243,
 9 |             10.49325,
10 |             10.50311,
11 |             10.48985,
12 |             10.4721,
13 |             10.41217,
14 |             10.2805,
15 |             10.14052,
16 |             9.94191
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             8.58282,
25 |             2.06311,
26 |             2.05789,
27 |             2.24493,
28 |             2.05273,
29 |             2.05118,
30 |             2.05666,
31 |             2.04533,
32 |             2.05152,
33 |             2.04761
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 34,
39 |         "step_interval": 5,
40 |         "values": [
41 |             26081.0,
42 |             18799.0,
43 |             24479.0,
44 |             23782.0,
45 |             21056.0,
46 |             19877.0,
47 |             19774.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.48685,
 8 |             10.49276,
 9 |             10.48837,
10 |             10.51348,
11 |             10.49396,
12 |             10.4755,
13 |             10.41921,
14 |             10.28044,
15 |             10.14256,
16 |             9.94738
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             10.8221,
25 |             1.96114,
26 |             1.9401,
27 |             2.22227,
28 |             1.94508,
29 |             1.94212,
30 |             1.93958,
31 |             1.94562,
32 |             1.9442,
33 |             1.94606
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 34,
39 |         "step_interval": 5,
40 |         "values": [
41 |             26876.0,
42 |             19339.0,
43 |             24146.0,
44 |             23625.0,
45 |             21440.0,
46 |             17865.0,
47 |             19282.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml:
--------------------------------------------------------------------------------
1 | ENV_VARS:
2 |   CUDA_DEVICE_MAX_CONNECTIONS: 1
3 |   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
4 |   NCCL_ALGO: Tree
5 |   CUBLAS_WORKSPACE_CONFIG: :4096:8
6 | MODEL_ARGS:
7 | TEST_TYPE: regular
8 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml:
--------------------------------------------------------------------------------
 1 | ENV_VARS:
 2 |   CUDA_DEVICE_MAX_CONNECTIONS: 1
 3 |   SKIP_PYTEST: 1
 4 | MODEL_ARGS:
 5 |   trainer.num_nodes: 1
 6 |   trainer.devices: 8
 7 |   trainer.max_steps: 50
 8 |   trainer.val_check_interval: 50
 9 |   trainer.limit_val_batches: 50
10 |   trainer.max_epochs: 'null'
11 |   trainer.precision: bf16
12 |   model.num_layers: 12
13 |   model.hidden_size: 768
14 |   model.num_attention_heads: 12
15 |   model.micro_batch_size: 1
16 |   model.global_batch_size: 8
17 |   model.tensor_model_parallel_size: 2
18 |   model.pipeline_model_parallel_size: 4
19 |   model.virtual_pipeline_model_parallel_size: 3
20 |   model.encoder_seq_length: 2048
21 |   model.max_position_embeddings: 2048
22 |   model.ffn_hidden_size: 3072
23 |   model.mcore_gpt: 'True'
24 |   model.apply_query_key_layer_scaling: 'True'
25 |   model.megatron_amp_O2: 'True'
26 |   model.data.data_prefix: '[]'
27 |   model.data.data_impl: mock
28 |   model.data.splits_string: '[99990,8,2]'
29 |   model.optim.name: distributed_fused_adam
30 |   model.optim.weight_decay: 0.1
31 |   exp_manager.create_checkpoint_callback: 'False'
32 |   model.sequence_parallel: 'True'
33 |   model.overlap_p2p_comm: 'True'
34 |   model.batch_p2p_comm: 'False'
35 | TEST_TYPE: regular
36 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml:
--------------------------------------------------------------------------------
 1 | ENV_VARS:
 2 |   CUDA_DEVICE_MAX_CONNECTIONS: 1
 3 |   SKIP_PYTEST: 1
 4 | MODEL_ARGS:
 5 |   trainer.num_nodes: 1
 6 |   trainer.devices: 8
 7 |   trainer.max_steps: 50
 8 |   trainer.val_check_interval: 50
 9 |   trainer.limit_val_batches: 50
10 |   trainer.max_epochs: 'null'
11 |   trainer.precision: bf16
12 |   model.num_layers: 12
13 |   model.hidden_size: 768
14 |   model.num_attention_heads: 12
15 |   model.micro_batch_size: 4
16 |   model.global_batch_size: 64
17 |   model.tensor_model_parallel_size: 1
18 |   model.pipeline_model_parallel_size: 1
19 |   model.virtual_pipeline_model_parallel_size: 'null'
20 |   model.encoder_seq_length: 2048
21 |   model.max_position_embeddings: 2048
22 |   model.ffn_hidden_size: 3072
23 |   model.mcore_gpt: 'True'
24 |   model.apply_query_key_layer_scaling: 'True'
25 |   model.megatron_amp_O2: 'True'
26 |   model.data.data_prefix: '[]'
27 |   model.data.data_impl: mock
28 |   model.data.splits_string: '[99990,8,2]'
29 |   model.optim.name: distributed_fused_adam
30 |   model.optim.weight_decay: 0.1
31 |   exp_manager.create_checkpoint_callback: 'False'
32 | TEST_TYPE: regular
33 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.83373,
 8 |             10.86683,
 9 |             10.89023,
10 |             10.81051,
11 |             10.68459,
12 |             10.60979,
13 |             10.08992,
14 |             10.21481,
15 |             10.14018,
16 |             9.80603
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1488.0,
25 |             1854.0,
26 |             1854.0,
27 |             1884.0,
28 |             1794.0,
29 |             1784.0,
30 |             1569.0,
31 |             1942.0,
32 |             2263.0,
33 |             2147.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             13.39475,
42 |             0.14158,
43 |             0.14256,
44 |             0.14166,
45 |             0.14243,
46 |             0.14232,
47 |             0.143,
48 |             0.14113,
49 |             0.14164,
50 |             0.14069
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.83373,
 8 |             10.86683,
 9 |             10.89023,
10 |             10.81051,
11 |             10.68459,
12 |             10.60979,
13 |             10.08992,
14 |             10.21481,
15 |             10.14018,
16 |             9.80603
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1488.0,
25 |             1854.0,
26 |             1854.0,
27 |             1884.0,
28 |             1794.0,
29 |             1784.0,
30 |             1569.0,
31 |             1942.0,
32 |             2263.0,
33 |             2147.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             13.39475,
42 |             0.14158,
43 |             0.14256,
44 |             0.14166,
45 |             0.14243,
46 |             0.14232,
47 |             0.143,
48 |             0.14113,
49 |             0.14164,
50 |             0.14069
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.79206,
 8 |             10.86691,
 9 |             10.89065,
10 |             10.78186,
11 |             10.65978,
12 |             10.58022,
13 |             10.08207,
14 |             10.19156,
15 |             10.13495,
16 |             9.81167
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1626.0,
25 |             1866.0,
26 |             1959.0,
27 |             1816.0,
28 |             1890.0,
29 |             1654.0,
30 |             1537.0,
31 |             1965.0,
32 |             2436.0,
33 |             2405.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             21.9348,
42 |             0.1633,
43 |             0.16334,
44 |             0.16269,
45 |             0.16133,
46 |             0.16064,
47 |             0.16007,
48 |             0.15926,
49 |             0.1592,
50 |             0.15982
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.79206,
 8 |             10.86691,
 9 |             10.89065,
10 |             10.78186,
11 |             10.65978,
12 |             10.58022,
13 |             10.08207,
14 |             10.19156,
15 |             10.13495,
16 |             9.81167
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1626.0,
25 |             1866.0,
26 |             1959.0,
27 |             1816.0,
28 |             1890.0,
29 |             1654.0,
30 |             1537.0,
31 |             1965.0,
32 |             2436.0,
33 |             2405.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             21.9348,
42 |             0.1633,
43 |             0.16334,
44 |             0.16269,
45 |             0.16133,
46 |             0.16064,
47 |             0.16007,
48 |             0.15926,
49 |             0.1592,
50 |             0.15982
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.86122,
 8 |             10.88647,
 9 |             10.87773,
10 |             10.83111,
11 |             10.7165,
12 |             10.60619,
13 |             10.13147,
14 |             10.22767,
15 |             10.15929,
16 |             9.83482
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1694.0,
25 |             2148.0,
26 |             2169.0,
27 |             2103.0,
28 |             1991.0,
29 |             1900.0,
30 |             1707.0,
31 |             2189.0,
32 |             2557.0,
33 |             2606.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             9.61991,
42 |             0.29135,
43 |             0.28852,
44 |             0.28971,
45 |             0.29221,
46 |             0.28994,
47 |             0.28976,
48 |             0.28887,
49 |             0.28975,
50 |             0.2869
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.86122,
 8 |             10.88647,
 9 |             10.87773,
10 |             10.83111,
11 |             10.7165,
12 |             10.60623,
13 |             10.13146,
14 |             10.2277,
15 |             10.15933,
16 |             9.8348
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1694.0,
25 |             2148.0,
26 |             2169.0,
27 |             2103.0,
28 |             1991.0,
29 |             1869.0,
30 |             1760.0,
31 |             2214.0,
32 |             2529.0,
33 |             2587.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             11.72537,
42 |             0.29824,
43 |             0.29549,
44 |             0.29574,
45 |             0.29514,
46 |             0.29533,
47 |             0.29415,
48 |             0.30722,
49 |             0.29731,
50 |             0.29867
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.87346,
 8 |             10.89625,
 9 |             10.88939,
10 |             10.88681,
11 |             10.8893,
12 |             10.84863,
13 |             10.6962,
14 |             10.63919,
15 |             10.53931,
16 |             10.31119
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             4.95266,
25 |             0.07818,
26 |             0.07961,
27 |             0.07716,
28 |             0.08368,
29 |             0.08327,
30 |             0.08409,
31 |             0.08371,
32 |             0.08372,
33 |             0.08387
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 32,
39 |         "step_interval": 5,
40 |         "values": [
41 |             1300.0,
42 |             1287.0,
43 |             1565.0,
44 |             1441.0,
45 |             1419.0,
46 |             1295.0,
47 |             1177.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.87346,
 8 |             10.89625,
 9 |             10.88939,
10 |             10.88681,
11 |             10.88931,
12 |             10.84864,
13 |             10.6962,
14 |             10.63918,
15 |             10.5393,
16 |             10.31119
17 |         ]
18 |     },
19 |     "iteration-time": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             5.32064,
25 |             0.08204,
26 |             0.08233,
27 |             0.08176,
28 |             0.09748,
29 |             0.0966,
30 |             0.09648,
31 |             0.09617,
32 |             0.09604,
33 |             0.09646
34 |         ]
35 |     },
36 |     "num-zeros": {
37 |         "start_step": 0,
38 |         "end_step": 32,
39 |         "step_interval": 5,
40 |         "values": [
41 |             1112.0,
42 |             1124.0,
43 |             1229.0,
44 |             1665.0,
45 |             1269.0,
46 |             1219.0,
47 |             1572.0
48 |         ]
49 |     }
50 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.8401,
 8 |             10.87259,
 9 |             10.85024,
10 |             10.79646,
11 |             10.68156,
12 |             10.60618,
13 |             10.12768,
14 |             10.22185,
15 |             10.13788,
16 |             9.82309
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1698.0,
25 |             1855.0,
26 |             1949.0,
27 |             1968.0,
28 |             1881.0,
29 |             1783.0,
30 |             1653.0,
31 |             2037.0,
32 |             2313.0,
33 |             2300.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             5.37706,
42 |             0.09618,
43 |             0.09432,
44 |             0.09666,
45 |             0.09442,
46 |             0.09619,
47 |             0.09453,
48 |             0.0975,
49 |             0.09517,
50 |             0.09727
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.8401,
 8 |             10.87259,
 9 |             10.85023,
10 |             10.79646,
11 |             10.68153,
12 |             10.60619,
13 |             10.12767,
14 |             10.22185,
15 |             10.13787,
16 |             9.82307
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1698.0,
25 |             1855.0,
26 |             1896.0,
27 |             1866.0,
28 |             2032.0,
29 |             1814.0,
30 |             1664.0,
31 |             1961.0,
32 |             2306.0,
33 |             2403.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             8.00253,
42 |             0.13176,
43 |             0.13026,
44 |             0.13184,
45 |             0.13023,
46 |             0.13135,
47 |             0.13014,
48 |             0.13143,
49 |             0.1305,
50 |             0.13191
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82974, 10.85934, 10.88536, 10.78981, 10.64534, 10.56415, 9.99534, 10.13972, 10.06259, 9.71481]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [261.0, 256.0, 258.0, 250.0, 243.0, 265.0, 254.0, 299.0, 299.0, 294.0]}, "iteration_timing_avg": 0.3993126470588235}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85803, 10.88122, 10.85832, 10.80987, 10.66115, 10.55375, 10.01843, 10.14234, 10.05958, 9.71149]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [244.0, 231.0, 243.0, 257.0, 247.0, 267.0, 256.0, 299.0, 318.0, 325.0]}, "iteration_timing_avg": 0.3993126470588235}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.8468,
 8 |             10.87769,
 9 |             10.90302,
10 |             10.82026,
11 |             10.67979,
12 |             10.60157,
13 |             10.06449,
14 |             10.19316,
15 |             10.11411,
16 |             9.76007
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1692.0,
25 |             2044.0,
26 |             2005.0,
27 |             2007.0,
28 |             1945.0,
29 |             1868.0,
30 |             1701.0,
31 |             2085.0,
32 |             2389.0,
33 |             2377.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             10.20538,
42 |             0.14353,
43 |             0.14213,
44 |             0.14213,
45 |             0.14068,
46 |             0.14104,
47 |             0.14078,
48 |             0.14149,
49 |             0.14065,
50 |             0.14118
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.84474,
 8 |             10.87688,
 9 |             10.90253,
10 |             10.81872,
11 |             10.67849,
12 |             10.60076,
13 |             10.06361,
14 |             10.19267,
15 |             10.11344,
16 |             9.75987
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1769.0,
25 |             2129.0,
26 |             1987.0,
27 |             1961.0,
28 |             1961.0,
29 |             1886.0,
30 |             1655.0,
31 |             2130.0,
32 |             2315.0,
33 |             2362.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             8.72642,
42 |             0.16194,
43 |             0.15926,
44 |             0.15956,
45 |             0.15972,
46 |             0.1623,
47 |             0.16029,
48 |             0.15863,
49 |             0.15947,
50 |             0.15935
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.79205,
 8 |             10.86789,
 9 |             10.89149,
10 |             10.78328,
11 |             10.66126,
12 |             10.58275,
13 |             10.08467,
14 |             10.19448,
15 |             10.13785,
16 |             9.81454
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1580.0,
25 |             1778.0,
26 |             1849.0,
27 |             1841.0,
28 |             1884.0,
29 |             1679.0,
30 |             1544.0,
31 |             1953.0,
32 |             2449.0,
33 |             2335.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             10.79458,
42 |             0.16744,
43 |             0.16286,
44 |             0.16276,
45 |             0.16292,
46 |             0.16346,
47 |             0.16288,
48 |             0.16273,
49 |             0.16282,
50 |             0.16245
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.79208,
 8 |             10.86688,
 9 |             10.89063,
10 |             10.7818,
11 |             10.65964,
12 |             10.58005,
13 |             10.0819,
14 |             10.19136,
15 |             10.13478,
16 |             9.81149
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1602.0,
25 |             1792.0,
26 |             1751.0,
27 |             1885.0,
28 |             1872.0,
29 |             1716.0,
30 |             1561.0,
31 |             1867.0,
32 |             2355.0,
33 |             2329.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             13.82777,
42 |             0.17397,
43 |             0.17253,
44 |             0.17285,
45 |             0.17221,
46 |             0.17204,
47 |             0.17139,
48 |             0.17105,
49 |             0.17258,
50 |             0.17185
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.74049,
 8 |             10.81937,
 9 |             10.84178,
10 |             10.75558,
11 |             10.69821,
12 |             10.63096,
13 |             10.2026,
14 |             10.36288,
15 |             10.25634,
16 |             9.94255
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             2529.0,
25 |             2845.0,
26 |             2909.0,
27 |             2683.0,
28 |             2631.0,
29 |             2573.0,
30 |             2281.0,
31 |             2559.0,
32 |             2484.0,
33 |             2360.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             14.80986,
42 |             0.17896,
43 |             0.17664,
44 |             0.17758,
45 |             0.17762,
46 |             0.17676,
47 |             0.17638,
48 |             0.1761,
49 |             0.17725,
50 |             0.1755
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.82005,
 8 |             10.87447,
 9 |             10.87793,
10 |             10.79509,
11 |             10.68164,
12 |             10.59514,
13 |             10.10045,
14 |             10.21239,
15 |             10.13862,
16 |             9.80879
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1562.0,
25 |             1754.0,
26 |             1879.0,
27 |             1778.0,
28 |             1877.0,
29 |             1733.0,
30 |             1578.0,
31 |             1924.0,
32 |             2299.0,
33 |             2292.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             18.71949,
42 |             0.16575,
43 |             0.16508,
44 |             0.16465,
45 |             0.16475,
46 |             0.16222,
47 |             0.16473,
48 |             0.16461,
49 |             0.16489,
50 |             0.16518
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.82005,
 8 |             10.87448,
 9 |             10.87796,
10 |             10.79506,
11 |             10.68153,
12 |             10.59413,
13 |             10.09983,
14 |             10.20957,
15 |             10.13642,
16 |             9.80012
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1562.0,
25 |             1687.0,
26 |             1848.0,
27 |             1736.0,
28 |             1955.0,
29 |             1764.0,
30 |             1580.0,
31 |             1886.0,
32 |             2252.0,
33 |             2259.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             16.16694,
42 |             0.16354,
43 |             0.16237,
44 |             0.16232,
45 |             0.16088,
46 |             0.15891,
47 |             0.15894,
48 |             0.15865,
49 |             0.16009,
50 |             0.1576
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.82005,
 8 |             10.87447,
 9 |             10.87793,
10 |             10.79509,
11 |             10.68164,
12 |             10.59514,
13 |             10.10045,
14 |             10.21239,
15 |             10.13862,
16 |             9.80879
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1562.0,
25 |             1754.0,
26 |             1879.0,
27 |             1778.0,
28 |             1877.0,
29 |             1733.0,
30 |             1578.0,
31 |             1924.0,
32 |             2299.0,
33 |             2292.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             18.68941,
42 |             0.16498,
43 |             0.16403,
44 |             0.16281,
45 |             0.16302,
46 |             0.16352,
47 |             0.16473,
48 |             0.16207,
49 |             0.16362,
50 |             0.16219
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.82005,
 8 |             10.87447,
 9 |             10.87799,
10 |             10.79507,
11 |             10.68165,
12 |             10.59511,
13 |             10.10047,
14 |             10.2124,
15 |             10.13861,
16 |             9.80876
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1562.0,
25 |             1738.0,
26 |             1852.0,
27 |             1802.0,
28 |             1917.0,
29 |             1765.0,
30 |             1570.0,
31 |             1949.0,
32 |             2251.0,
33 |             2270.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             14.96968,
42 |             0.16347,
43 |             0.16403,
44 |             0.16317,
45 |             0.162,
46 |             0.16129,
47 |             0.16268,
48 |             0.16156,
49 |             0.16212,
50 |             0.16407
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87799, 10.79508, 10.68166, 10.59514, 10.10042, 10.21238, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1857.0, 1746.0, 1883.0, 1738.0, 1475.0, 1851.0, 2303.0, 2258.0]}, "iteration_timing_avg": 0.12873676470588236}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.8763, 10.79906, 10.68214, 10.59702, 10.49258, 10.11236, 10.12393, 9.98165]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1118.0, 1331.0, 1230.0, 1085.0, 1180.0, 1245.0, 1454.0, 1330.0, 1752.0, 1851.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [17.24286, 0.35341, 0.35187, 0.35028, 0.34941, 0.35093, 0.3488, 0.35179, 0.34905, 0.34684]}}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.87624, 10.79904, 10.68212, 10.59698, 10.49257, 10.11232, 10.12396, 9.98163]},  "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22.22011, 0.36082, 0.35927, 0.35627, 0.35901, 0.35008, 0.34828, 0.34774, 0.35145, 0.35141]}}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9735, 10.96043, 10.95576, 10.91038, 10.78791, 10.71201, 10.22424, 10.28926, 10.19049, 9.86378]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727052.0, 23021930.0, 22501022.0, 22831208.0, 22740024.0, 22547916.0, 22955210.0, 22589344.0, 22658940.0, 22884970.0]},"iteration_timing_avg": 0.1367805882352941}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.88734,
 8 |             10.91614,
 9 |             10.89061,
10 |             10.86173,
11 |             10.72753,
12 |             10.64491,
13 |             10.18012,
14 |             10.2562,
15 |             10.1611,
16 |             9.8539
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             3268.0,
25 |             4040.0,
26 |             4142.0,
27 |             3766.0,
28 |             4028.0,
29 |             3648.0,
30 |             3306.0,
31 |             4028.0,
32 |             4648.0,
33 |             4546.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             7.0561,
42 |             0.32588,
43 |             0.32628,
44 |             0.32385,
45 |             0.32419,
46 |             0.32364,
47 |             0.32337,
48 |             0.32334,
49 |             0.32358,
50 |             0.32395
51 |         ]
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88734, 10.91612, 10.8906, 10.86171, 10.72752, 10.64491, 10.18015, 10.25622, 10.16111, 9.85394]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3228.0, 3820.0, 3890.0, 3848.0, 3902.0, 3486.0, 3310.0, 3982.0, 4472.0, 4532.0]}, "iteration_timing_avg": 0.22043823529411763}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.79987,
 8 |             10.85983,
 9 |             10.865,
10 |             10.799,
11 |             10.70987,
12 |             10.63782,
13 |             10.1965,
14 |             10.3099,
15 |             10.22262,
16 |             9.91423
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             30784.0,
25 |             37528.0,
26 |             37616.0,
27 |             36105.0,
28 |             33464.0,
29 |             34923.0,
30 |             30806.0,
31 |             35663.0,
32 |             36661.0,
33 |             37641.0
34 |         ]
35 |     },
36 |     "iteration_timing_avg": 0.3566726470588235
37 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.8029,
 8 |             10.86149,
 9 |             10.86819,
10 |             10.80829,
11 |             10.72062,
12 |             10.64588,
13 |             10.21132,
14 |             10.32324,
15 |             10.2265,
16 |             9.92918
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             31473.0,
25 |             37753.0,
26 |             38332.0,
27 |             36348.0,
28 |             33270.0,
29 |             34310.0,
30 |             30284.0,
31 |             35432.0,
32 |             36356.0,
33 |             37109.0
34 |         ]
35 |     },
36 |     "iteration_timing_avg": 0.21900323529411767
37 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.83445,
 8 |             10.87978,
 9 |             10.87924,
10 |             10.81567,
11 |             10.69374,
12 |             10.60333,
13 |             10.08824,
14 |             10.21471,
15 |             10.10778,
16 |             9.78309
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             26648.0,
25 |             32884.0,
26 |             33611.0,
27 |             31683.0,
28 |             28744.0,
29 |             30671.0,
30 |             28602.0,
31 |             33538.0,
32 |             34560.0,
33 |             35099.0
34 |         ]
35 |     },
36 |     "iteration_timing_avg": 0.28211852941176474
37 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.81823,
 8 |             10.86998,
 9 |             10.8727,
10 |             10.80014,
11 |             10.67571,
12 |             10.57944,
13 |             10.06572,
14 |             10.19342,
15 |             10.08575,
16 |             9.75236
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             26801.0,
25 |             32734.0,
26 |             32925.0,
27 |             31593.0,
28 |             28610.0,
29 |             30362.0,
30 |             28464.0,
31 |             33486.0,
32 |             33403.0,
33 |             35162.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             8.63293,
42 |             0.29454,
43 |             0.28102,
44 |             0.28297,
45 |             0.28369,
46 |             0.2848,
47 |             0.30008,
48 |             0.29214,
49 |             0.31041,
50 |             0.295
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.81823,
 8 |             10.86998,
 9 |             10.8727,
10 |             10.80014,
11 |             10.67571,
12 |             10.57944,
13 |             10.06572,
14 |             10.19342,
15 |             10.08575,
16 |             9.75236
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             26801.0,
25 |             32734.0,
26 |             32925.0,
27 |             31593.0,
28 |             28610.0,
29 |             30362.0,
30 |             28464.0,
31 |             33486.0,
32 |             33403.0,
33 |             35162.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             11.94141,
42 |             0.28425,
43 |             0.28413,
44 |             0.29449,
45 |             0.28534,
46 |             0.29977,
47 |             0.30061,
48 |             0.30321,
49 |             0.30986,
50 |             0.30404
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.92705,
 8 |             10.93624,
 9 |             10.89333,
10 |             10.87317,
11 |             10.74871,
12 |             10.65379,
13 |             10.15753,
14 |             10.24638,
15 |             10.15178,
16 |             9.83806
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1653.0,
25 |             1874.0,
26 |             1994.0,
27 |             1828.0,
28 |             1769.0,
29 |             1845.0,
30 |             1674.0,
31 |             1957.0,
32 |             2364.0,
33 |             2345.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             11.33146,
42 |             0.22344,
43 |             0.21997,
44 |             0.21977,
45 |             0.21792,
46 |             0.21685,
47 |             0.22555,
48 |             0.21755,
49 |             0.21796,
50 |             0.21694
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.92705,
 8 |             10.93628,
 9 |             10.89334,
10 |             10.87322,
11 |             10.74869,
12 |             10.65374,
13 |             10.15755,
14 |             10.24638,
15 |             10.15177,
16 |             9.83799
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             68.0,
25 |             64.0,
26 |             61.0,
27 |             70.0,
28 |             66.0,
29 |             55.0,
30 |             76.0,
31 |             72.0,
32 |             64.0,
33 |             85.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             9.68102,
42 |             0.22487,
43 |             0.22503,
44 |             0.22418,
45 |             0.22445,
46 |             0.22504,
47 |             0.22333,
48 |             0.22333,
49 |             0.22458,
50 |             0.22367
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.92705,
 8 |             10.93624,
 9 |             10.89333,
10 |             10.87317,
11 |             10.74871,
12 |             10.65379,
13 |             10.15753,
14 |             10.24638,
15 |             10.15178,
16 |             9.83806
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1653.0,
25 |             1874.0,
26 |             1994.0,
27 |             1828.0,
28 |             1769.0,
29 |             1845.0,
30 |             1674.0,
31 |             1957.0,
32 |             2364.0,
33 |             2345.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             11.05896,
42 |             0.21941,
43 |             0.22052,
44 |             0.22086,
45 |             0.22118,
46 |             0.22063,
47 |             0.22075,
48 |             0.22064,
49 |             0.22956,
50 |             0.23548
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.92705,
 8 |             10.93624,
 9 |             10.89333,
10 |             10.87317,
11 |             10.74871,
12 |             10.65379,
13 |             10.15753,
14 |             10.24638,
15 |             10.15178,
16 |             9.83806
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1653.0,
25 |             1874.0,
26 |             1994.0,
27 |             1828.0,
28 |             1769.0,
29 |             1845.0,
30 |             1674.0,
31 |             1957.0,
32 |             2364.0,
33 |             2345.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             9.47055,
42 |             0.34439,
43 |             0.22313,
44 |             0.22277,
45 |             0.22175,
46 |             0.21936,
47 |             0.23348,
48 |             0.22009,
49 |             0.22043,
50 |             0.21934
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lm loss": {
 3 |         "start_step": 0,
 4 |         "end_step": 50,
 5 |         "step_interval": 5,
 6 |         "values": [
 7 |             10.86217,
 8 |             10.88646,
 9 |             10.87861,
10 |             10.83295,
11 |             10.7203,
12 |             10.61089,
13 |             10.14181,
14 |             10.23434,
15 |             10.16609,
16 |             9.84444
17 |         ]
18 |     },
19 |     "num-zeros": {
20 |         "start_step": 0,
21 |         "end_step": 50,
22 |         "step_interval": 5,
23 |         "values": [
24 |             1769.0,
25 |             2056.0,
26 |             2198.0,
27 |             2079.0,
28 |             2181.0,
29 |             1912.0,
30 |             1825.0,
31 |             2115.0,
32 |             2621.0,
33 |             2598.0
34 |         ]
35 |     },
36 |     "iteration-time": {
37 |         "start_step": 0,
38 |         "end_step": 50,
39 |         "step_interval": 5,
40 |         "values": [
41 |             6.42448,
42 |             0.42854,
43 |             0.42836,
44 |             0.42582,
45 |             0.42274,
46 |             0.42187,
47 |             0.42561,
48 |             0.42178,
49 |             0.44234,
50 |             0.42304
51 |         ]
52 |     }
53 | }


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112}
2 | 


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, "iteration_timing_avg": 0.5847132352941178}


--------------------------------------------------------------------------------
/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json:
--------------------------------------------------------------------------------
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.41501, 9.20443, 8.62112, 8.34419, 8.08454, 7.96905, 7.68086, 7.39418, 7.26109, 7.19122]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [115751.0, 111072.0, 117055.0, 112398.0, 118712.0, 116944.0, 111387.0, 114025.0, 118464.0, 116959.0]}, "iteration_timing_avg": 0.2253964705882353}


--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-dev.yaml:
--------------------------------------------------------------------------------
 1 | type: build
 2 | format_version: 1
 3 | maintainers: [maanug]
 4 | spec:
 5 |   name: mcore-pyt-dev
 6 |   platforms: [linux/amd64]
 7 |   source:
 8 |     # The image tag will be added via `jet-tests.yaml`
 9 |     # Tags are one of {buildcache, $CI_PIPELINE_ID}
10 |     image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev
11 |     


--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-mcore-lts.yaml:
--------------------------------------------------------------------------------
 1 | type: build
 2 | format_version: 1
 3 | maintainers: [maanug]
 4 | spec:
 5 |   name: mcore-pyt-lts
 6 |   platforms: [linux/amd64]
 7 |   source:
 8 |     # The image tag will be added via `jet-tests.yaml`
 9 |     # Tags are one of {buildcache, $CI_PIPELINE_ID}
10 |     image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_lts
11 |     


--------------------------------------------------------------------------------
/tests/test_utils/recipes/_build-nemo.yaml:
--------------------------------------------------------------------------------
 1 | type: build
 2 | format_version: 1
 3 | maintainers: [maanug]
 4 | spec:
 5 |   name: mcore-nemo
 6 |   platforms: [linux/amd64]
 7 |   source:
 8 |     # The image tag will be added via `jet-tests.yaml`
 9 |     # Tags are one of {buildcache, $CI_PIPELINE_ID}
10 |     image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci


--------------------------------------------------------------------------------
/tests/test_utils/recipes/gpt-nemo.yaml:
--------------------------------------------------------------------------------
 1 | type: basic
 2 | format_version: 1
 3 | maintainers: [mcore]
 4 | loggers: [stdout]
 5 | spec:
 6 |   name: "{test_case}"  # NOTE(review): curly-brace placeholders look like they are filled per entry of the products matrix below; confirm against the recipe runner
 7 |   model: gpt-nemo
 8 |   build: mcore-nemo  # same name as spec.name in _build-nemo.yaml
 9 |   nodes: 1
10 |   gpus: 8
11 |   platforms: dgx_a100
12 |   time_limit: 1800
13 |   scope: null
14 |   script: |-  # the ARGUMENTS array is expanded inside double quotes so every KEY=VALUE entry reaches run_ci_test.sh as exactly one argument
15 |     ls
16 |     cd /opt/NeMo
17 |   
18 |     ARGUMENTS=(
19 |         "DATA_PATH='-'"
20 |         "DATA_CACHE_PATH='-'"
21 |         "OUTPUT_PATH={assets_dir}"
22 |         "TENSORBOARD_PATH={assets_dir}/tensorboard"
23 |         "CHECKPOINT_PATH=/workspace/checkpoints"
24 |         "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
25 |         "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
26 |         "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
27 |         "N_REPEAT={n_repeat}"
28 |     )
29 | 
30 |     bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh "${{ARGUMENTS[@]}}"
31 | 
32 | products:
33 |   - environment: [dev]
34 |     scope: [mr]
35 |     n_repeat: [5]
36 |     test_case:
37 |     - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
38 |     - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
39 |     


--------------------------------------------------------------------------------
/tests/unit_tests/__init__.py:
--------------------------------------------------------------------------------
1 | import torch._dynamo
2 | 
3 | torch._dynamo.config.suppress_errors = True  # set at package import, so it applies to every unit test; per PyTorch docs this makes TorchDynamo fall back to eager execution instead of raising on compile errors
4 | 


--------------------------------------------------------------------------------
/tests/unit_tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/data/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/conftest.py:
--------------------------------------------------------------------------------
 1 | from unittest import mock
 2 | 
 3 | import pytest
 4 | 
 5 | from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy
 6 | 
 7 | 
 8 | def pytest_sessionfinish(session, exitstatus):  # pytest hook: invoked once after the whole test session has finished
 9 |     if exitstatus == 5:  # 5 is pytest's "no tests collected" exit code (pytest.ExitCode.NO_TESTS_COLLECTED)
10 |         session.exitstatus = 0  # overwrite it with 0 so the session reports success
11 | 
12 | 
13 | @pytest.fixture(scope='session', autouse=True)  # session-scoped and autouse: activated without being requested by tests
14 | def set_default_dist_ckpt_strategy():  # patches the default sharded-save strategy lookup for the duration of the session
15 |     def get_pyt_dist_save_sharded_strategy():  # zero-argument stand-in for get_default_save_sharded_strategy
16 |         return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1)  # 'torch_dist' backend; the trailing 1 is presumably the format version - TODO confirm
17 | 
18 |     with mock.patch(
19 |         'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy',
20 |         new=get_pyt_dist_save_sharded_strategy,
21 |     ) as _fixture:
22 |         yield _fixture  # the patch stays active while the fixture is suspended here; leaving the with-block on teardown undoes it
23 | 


--------------------------------------------------------------------------------
/tests/unit_tests/dist_checkpointing/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/dist_checkpointing/models/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/export/trtllm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/export/trtllm/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/inference/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/inference/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/inference/engines/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/inference/model_inference_wrappers/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
 4 |     InferenceWrapperConfig,
 5 | )
 6 | 
 7 | 
 8 | class TestModelInferenceWrapperConfig:
 9 | 
10 |     def test_inference_params(self):
11 |         inference_parameters = InferenceWrapperConfig(
12 |             hidden_size=10,
13 |             inference_batch_times_seqlen_threshold=10,
14 |             padded_vocab_size=10,
15 |             params_dtype=torch.float,
16 |             fp32_residual_connection=False,
17 |         )
18 |         inference_parameters.add_attributes({"abc": 45})
19 |         assert (
20 |             inference_parameters.abc == 45
21 |         ), f"min tokens not set correctly. it is {inference_parameters.min_tokens}"
22 | 


--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_common_inference_params.py:
--------------------------------------------------------------------------------
 1 | from megatron.core.inference.common_inference_params import CommonInferenceParams
 2 | 
 3 | 
 4 | class TestCommonInferenceParams:
 5 | 
 6 |     def test_inference_params(self):
 7 |         inference_parameters = CommonInferenceParams()
 8 |         inference_parameters.add_attributes({"min_tokens": 45})
 9 |         assert (
10 |             inference_parameters.min_tokens == 45
11 |         ), f"min tokens not set correctly. it is {inference_parameters.min_tokens}"
12 | 


--------------------------------------------------------------------------------
/tests/unit_tests/inference/test_inference_utils.py:
--------------------------------------------------------------------------------
 1 | from megatron.core.inference.utils import Counter
 2 | 
 3 | 
 4 | class TestInferenceUtils:
 5 | 
 6 |     def test_counter(self):
 7 |         counter = Counter()
 8 |         r = next(counter)
 9 |         assert r == 0, f'Counter return value should be 0 but it is {r}'
10 |         assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}'
11 |         counter.reset()
12 |         assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}'
13 | 


--------------------------------------------------------------------------------
/tests/unit_tests/inference/text_generation_controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/inference/text_generation_controllers/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/models/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/pipeline_parallel/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/tensor_parallel/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_cross_entropy.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
 5 | from tests.unit_tests.test_utilities import Utils
 6 | 
 7 | 
 8 | def test_vocab_parallel_cross_entropy():
 9 |     Utils.initialize_model_parallel(4, 2)
10 |     vocab_parallel_logits = torch.range(0, 7).repeat(16, 4).cuda()
11 |     target = torch.arange(0, 32, 2).cuda()
12 |     output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
13 |     expected_output = torch.tensor(
14 |         [
15 |             10.2309,
16 |             8.2309,
17 |             6.2309,
18 |             4.2309,
19 |             10.2309,
20 |             8.2309,
21 |             6.2309,
22 |             4.2309,
23 |             10.2309,
24 |             8.2309,
25 |             6.2309,
26 |             4.2309,
27 |             10.2309,
28 |             8.2309,
29 |             6.2309,
30 |             4.2309,
31 |         ]
32 |     ).cuda()
33 |     assert torch.equal(torch.round(expected_output), torch.round(output))
34 |     Utils.destroy_model_parallel()
35 | 


--------------------------------------------------------------------------------
/tests/unit_tests/tensor_parallel/test_data.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from megatron.core.tensor_parallel.data import broadcast_data
 4 | from tests.unit_tests.test_utilities import Utils
 5 | 
 6 | 
 7 | def test_broadcast_data():
 8 |     Utils.initialize_model_parallel(2, 4)
 9 |     input_data = {
10 |         0: torch.ones((8, 8)).cuda() * 0.0,
11 |         1: torch.ones((8, 8)).cuda() * 1.0,
12 |         2: torch.ones((8, 8)).cuda() * 2.0,
13 |         3: torch.ones((8, 8)).cuda() * 3.0,
14 |         4: torch.ones((8, 8)).cuda() * 4.0,
15 |         5: torch.ones((8, 8)).cuda() * 5.0,
16 |         6: torch.ones((8, 8)).cuda() * 6.0,
17 |         7: torch.ones((8, 8)).cuda() * 7.0,
18 |     }
19 |     dtype = torch.float32
20 |     actual_output = broadcast_data([0, 1], input_data, dtype)
21 |     assert torch.equal(actual_output[0], input_data[0])
22 |     assert torch.equal(actual_output[1], input_data[1])
23 |     Utils.destroy_model_parallel()
24 | 


--------------------------------------------------------------------------------
/tests/unit_tests/test_basic.py:
--------------------------------------------------------------------------------
def test_import():
    """Smoke test: passes as long as the top-level ``megatron`` package imports without error."""
    import megatron
3 | 


--------------------------------------------------------------------------------
/tests/unit_tests/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/transformer/__init__.py


--------------------------------------------------------------------------------
/tests/unit_tests/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/tests/unit_tests/transformer/moe/__init__.py


--------------------------------------------------------------------------------
/tools/bert_embedding/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
2 | 
3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder
4 | 


--------------------------------------------------------------------------------
/tools/bert_embedding/external_libs.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | import importlib
 4 | 
 5 | required_libs = [
 6 |     "h5py",
 7 |     "transformers", # for huggingface bert
 8 | ]
 9 | 
10 | for lib in required_libs:
11 |     try:
12 |         globals()[lib] = importlib.import_module(lib)
13 |     except ImportError as e:
14 |         raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.")
15 | 


--------------------------------------------------------------------------------
/tools/checkpoint/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import psutil
 4 | 
 5 | 
 6 | def print_memory_usage(key, rank, num_ranks):
 7 |     '''Print memory usage.'''
 8 |     process = psutil.Process()
 9 |     mem_info = process.memory_info()
10 |     print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb." % (
11 |         key,
12 |         rank,
13 |         num_ranks,
14 |         mem_info.rss / 1024**3,
15 |         100 * mem_info.rss / process.memory_percent() / 1024**3,
16 |     ))
17 | 


--------------------------------------------------------------------------------
/tools/copyright.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Files ending with .py should have Copyright notice in the first line.
 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 5 | 
 6 | # Move to the project root
 7 | cd $SCRIPT_DIR/..
 8 | find_files_with_missing_copyright() {
 9 | find ./megatron/ -type f -name '*.py' | while read path; do
10 |     echo -en $path"\t"
11 |     head -2 $path | grep -iv 'coding=' | head -1
12 | done \
13 |    | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \
14 |    | grep -iv 'BSD 3-Clause License' \
15 |    | grep -iv 'Copyright.*Microsoft' \
16 |    | grep -iv 'Copyright.*The Open AI Team' \
17 |    | grep -iv 'Copyright.*The Google AI' \
18 |    | grep -iv 'Copyright.*Facebook' | while read line; do
19 |      echo $line | cut -d' ' -f1
20 |    done
21 | }
22 | 
23 | 
24 | declare RESULT=($(find_files_with_missing_copyright))  # (..) = array
25 | 
26 | if [ "${#RESULT[@]}" -gt 0 ]; then
27 |    echo "Error: Found files with missing copyright:"
28 |    for (( i=0; i<"${#RESULT[@]}"; i++ )); do
29 |       echo "path= ${RESULT[$i]}"
30 |    done
31 |    exit 1;
32 | else
33 |    echo "Ok: All files start with copyright notice"
34 | fi
35 | 


--------------------------------------------------------------------------------
/tools/openwebtext/merge_jsons.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | 
 4 | import glob
 5 | import sys
 6 | import json
 7 | import argparse
 8 | 
 9 | if __name__ == '__main__':
10 | 
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--json_path", type=str, default=".",
13 |         help="path where all the json files are located")
14 | 
15 |     parser.add_argument("--output_file", type=str, default="merged_output.json",
16 |         help="filename where the merged json should go")
17 | 
18 |     args = parser.parse_args()
19 | 
20 |     json_path = args.json_path
21 |     out_file = args.output_file
22 | 
23 |     json_files = glob.glob(json_path + '/*.json')
24 | 
25 |     counter = 0
26 | 
27 |     with open(out_file, 'w') as outfile:
28 |         for fname in json_files:
29 |             counter += 1
30 | 
31 |             if counter % 1024 == 0:
32 |                 print("Merging at ", counter, flush=True)
33 | 
34 |             with open(fname, 'r') as infile:
35 |                 for row in infile:
36 |                     each_row = json.loads(row)
37 |                     outfile.write(row)
38 | 
39 | 
40 |     print("Merged file", out_file, flush=True)
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/tools/report_theoretical_memory.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Computes theoretical memory footprint for model training without instantiating
 4 | a model and running training iterations on GPU(s)."""
 5 | 
 6 | from megatron.training import get_args
 7 | from megatron.training.initialize import initialize_megatron
 8 | from megatron.training.theoretical_memory_usage import report_theoretical_memory
 9 | 
if __name__ == "__main__":
    # Initialize Megatron with CUDA optional and model-parallel (mpu) setup
    # skipped, then print the verbose theoretical memory report for the parsed
    # arguments.
    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    report_theoretical_memory(get_args(), verbose=True)
15 | 


--------------------------------------------------------------------------------
/tools/retro/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
2 | 
3 | from .cli import retro
4 | 


--------------------------------------------------------------------------------
/tools/retro/cli/__main__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | import os
 4 | 
 5 | from . import retro
 6 | 
 7 | 
 8 | if __name__ == "__main__":
 9 |     retro.init(os.environ["RETRO_PROJECT_DIR"])
10 | 


--------------------------------------------------------------------------------
/tools/retro/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvcr.io/nvidia/pytorch:23.09-py3
 2 | 
 3 | RUN pip install -U faiss-gpu
 4 | 
 5 | RUN apt update
 6 | 
 7 | RUN apt install -qy htop
 8 | 
 9 | RUN pip install -U transformers
10 | 
11 | RUN pip install --upgrade google-api-python-client
12 | 
13 | RUN pip install sentencepiece
14 | 
15 | RUN pip install h5py
16 | 
17 | RUN pip install nltk
18 | 
19 | RUN pip install einops
20 | 


--------------------------------------------------------------------------------
/tools/retro/sft/README.md:
--------------------------------------------------------------------------------
1 | ## Note
2 | 
3 | The content within this `sft` directory is still under active development and will be updated soon.


--------------------------------------------------------------------------------
/tools/retro/sft/open_inst.sh:
--------------------------------------------------------------------------------
1 | DATA_BLEND="1.0 open_inst"
2 | 


--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | import sys
 3 | import json
 4 | import requests
 5 | 
 6 | 
 7 | if __name__ == "__main__":
 8 |     url = sys.argv[1]
 9 |     url = 'http://' + url + '/api'
10 |     headers = {'Content-Type': 'application/json'}
11 | 
12 |     while True:
13 |         sentence = input("Enter prompt: ")
14 |         tokens_to_generate = int(eval(input("Enter number of tokens to generate: ")))
15 | 
16 |         data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
17 |         response = requests.put(url, data=json.dumps(data), headers=headers)
18 | 
19 |         if response.status_code != 200:
20 |             print(f"Error {response.status_code}: {response.json()['message']}")
21 |         else:
22 |             print("Megatron Response: ")
23 |             print(response.json()['text'][0])
24 | 


--------------------------------------------------------------------------------