├── .coveragerc ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── enhancement.md │ ├── question.md │ └── regression.md └── workflows │ └── stale.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitlab ├── labeler-config.yml └── stages │ ├── 00.pre.yml │ ├── 01.test.yml │ ├── 02.functional-tests.yml │ └── 03.publish.yml ├── .pylintrc ├── CHANGELOG.md ├── CODEOWNERS ├── CONTRIBUTING.md ├── Dockerfile.ci.dev ├── Dockerfile.ci.lts ├── Dockerfile.linting ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_MEGATRON.md ├── data_preprocessing.sh ├── docs ├── llama_mistral.md └── source │ ├── api-guide │ ├── context_parallel.rst │ ├── datasets.rst │ ├── dist_checkpointing.rst │ ├── dist_checkpointing.strategies.rst │ ├── dist_optimizer.md │ ├── distributed.rst │ ├── encoder_decoder_parallelism.rst │ ├── fusions.rst │ ├── index.rst │ ├── models.bert.rst │ ├── models.gpt.rst │ ├── models.rst │ ├── models.t5.rst │ ├── moe.rst │ ├── num_microbatches_calculator.rst │ ├── optimizer_param_scheduler.rst │ ├── pipeline_parallel.rst │ ├── tensor_parallel.rst │ └── transformer.rst │ ├── images │ ├── context_parallel │ │ ├── CP_overview.png │ │ └── CP_results.png │ ├── distrib_optimizer │ │ ├── data_flow.png │ │ └── sharding_scheme.png │ └── moe │ │ └── token_drop.png │ ├── index.rst │ └── user-guide │ └── index.rst ├── examples ├── academic_paper_scripts │ ├── detxoify_lm │ │ ├── README.md │ │ ├── annotations │ │ │ ├── filter-selfgeneration.py │ │ │ ├── perspective_api_annotate.py │ │ │ └── preprocess.sh │ │ ├── finetune_gpt.py │ │ ├── finetune_gpt_distributed-1.3b.sh │ │ ├── generate-1.3b.sh │ │ ├── generate_samples_gpt.py │ │ ├── perspective_api.py │ │ └── self_generation │ │ │ └── selfgenerate-1.3b-unconditional.sh │ ├── msdp │ │ ├── README.md │ │ ├── data_processing.sh │ │ ├── eval_knwl_generation.sh │ │ ├── eval_resp_generation.sh │ │ ├── prep_resp_gen.sh │ │ ├── prompt_knwl_gen.sh │ │ └── prompt_resp_gen.sh │ └── sc21 │ │ ├── CONFIG.sh │ │ ├── README.md │ │ ├── SBATCH.sh 
│ │ ├── SRUN.sh │ │ ├── run_figure_11.sh │ │ ├── run_figure_12.sh │ │ ├── run_figure_13.sh │ │ ├── run_figure_14.sh │ │ ├── run_figure_15.sh │ │ ├── run_figure_16.sh │ │ ├── run_figure_17.sh │ │ ├── run_figure_18.sh │ │ └── run_table_1.sh ├── bert │ ├── README.md │ └── train_bert_340m_distributed.sh ├── export │ ├── README.md │ ├── knowledge_distillation │ │ └── pretrain_gpt_modelopt.py │ ├── ptq_and_trtllm_export │ │ ├── README.md │ │ ├── ptq_trtllm_llama2_7b.sh │ │ ├── ptq_trtllm_llama3_1_8b.sh │ │ ├── ptq_trtllm_llama3_8b.sh │ │ ├── ptq_trtllm_minitron_8b.sh │ │ ├── ptq_trtllm_mistral_12b.sh │ │ ├── ptq_trtllm_mixtral_8x7b.sh │ │ ├── text_generation_ptq.py │ │ └── trtllm_text_generation.py │ └── trtllm_export │ │ ├── README.md │ │ ├── distributed_export │ │ └── gpt_distributed_gpu_export.py │ │ └── single_device_export │ │ └── gpt_single_device_cpu_export.py ├── gpt3 │ ├── README.md │ ├── gpt_config.yaml │ └── train_gpt3_175b_distributed.sh ├── inference │ ├── README.md │ ├── gpt │ │ └── simple_gpt_batch_inference.py │ ├── llama_mistral │ │ ├── huggingface_reference.py │ │ ├── run_text_generation_llama3.1.sh │ │ ├── run_text_generation_llama3.sh │ │ └── run_text_generation_mistral.sh │ ├── run_text_generation_server_345M.sh │ ├── run_text_generation_server_345M_8_tensor_parallel.sh │ └── t5 │ │ └── simple_t5_batch_inference.py ├── mamba │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── run_text_gen_server_8b.sh │ ├── run_text_gen_server_8b_gpt3.sh │ └── train.sh ├── mixtral │ ├── README.md │ └── train_mixtral_8x7b_distributed.sh ├── multimodal │ ├── Dockerfile │ ├── README.md │ ├── assets │ │ └── pretrain_curves.png │ ├── combine_lm_vision_checkpoints.sh │ ├── combine_state_dicts.py │ ├── config.py │ ├── convert_llava_pretrain_to_wds.py │ ├── dataloader_provider.py │ ├── dataset_helpers.py │ ├── evaluate_ai2d.py │ ├── evaluate_chartqa.py │ ├── evaluate_coco.py │ ├── evaluate_mathvista.py │ ├── evaluate_mmmu.py │ ├── evaluate_ocrbench.py │ ├── 
evaluate_textvqa.py │ ├── evaluate_vqav2.py │ ├── evaluation_datasets.py │ ├── image_processing.py │ ├── layer_specs.py │ ├── manual_prompts.json │ ├── model.py │ ├── model_converter │ │ ├── clip_converter.py │ │ ├── internvit_converter.py │ │ ├── siglip_converter.py │ │ └── vision_model_tester.py │ ├── multimodal_args.py │ ├── nvlm │ │ ├── README.md │ │ ├── internvit.py │ │ ├── nvlm_prompts.json │ │ ├── pp_checkpoint_converter.py │ │ ├── pretrain_blend.yaml │ │ ├── pretrain_qwen20_72b_internvit_6b.sh │ │ ├── pretrain_yi_34b_internvit_6b.sh │ │ ├── run_text_generation_qwen20_72b_internvit_6b.sh │ │ ├── run_text_generation_yi_34b_internvit_6b.sh │ │ ├── sft_34b_internvit.sh │ │ ├── sft_blend.yaml │ │ └── sft_qwen20_72b_internvit_6b.sh │ ├── pretrain_dataset.yaml │ ├── pretrain_mistral_clip.sh │ ├── run_text_generation.py │ ├── sft_dataset.yaml │ ├── sft_mistral_clip.sh │ ├── text_generation_mistral_clip.sh │ └── train.py ├── retro │ ├── README.md │ ├── preprocess_data.sh │ └── train_retro_2b_distributed.sh ├── run_simple_mcore_train_loop.py └── t5 │ ├── README.md │ ├── t5_mcore_train_curve.png │ └── train_t5_220m_distributed.sh ├── images ├── model_table.png ├── remoe_comparison.png ├── remoe_scaling.png ├── strong_scaling.png └── weak_scaling.png ├── megatron ├── core │ ├── QuickStart.md │ ├── README.md │ ├── README_STRAGGLER.md │ ├── __init__.py │ ├── config_logger.py │ ├── datasets │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── blended_dataset.py │ │ ├── blended_megatron_dataset_builder.py │ │ ├── blended_megatron_dataset_config.py │ │ ├── gpt_dataset.py │ │ ├── helpers.cpp │ │ ├── helpers.py │ │ ├── indexed_dataset.py │ │ ├── masked_dataset.py │ │ ├── megatron_dataset.py │ │ ├── megatron_tokenizer.py │ │ ├── multimodal_dataset.py │ │ ├── readme.md │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_embedders.py │ │ │ │ ├── config.py │ │ │ │ ├── gpt_chunk_datasets.py │ │ │ │ └── tokenizers.py │ │ │ 
├── db │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── dataset.py │ │ │ │ └── utils.py │ │ │ ├── external_libs.py │ │ │ ├── index │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── factory.py │ │ │ │ ├── index.py │ │ │ │ ├── indexes │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── faiss_base.py │ │ │ │ │ └── faiss_par_add.py │ │ │ │ ├── utils.py │ │ │ │ └── validate.py │ │ │ ├── query │ │ │ │ ├── __init__.py │ │ │ │ ├── gpt_chunk_dataset.py │ │ │ │ ├── multi_split_gpt_dataset.py │ │ │ │ ├── query.py │ │ │ │ ├── retro_dataset.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── t5_dataset.py │ │ ├── utils.py │ │ └── utils_s3.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── exchange_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── state_dict_transformation.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── async_utils.py │ │ │ ├── base.py │ │ │ ├── common.py │ │ │ ├── filesystem_async.py │ │ │ ├── fully_parallel.py │ │ │ ├── resharding.py │ │ │ ├── state_dict_saver.py │ │ │ ├── tensorstore.py │ │ │ ├── torch.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ ├── utils.py │ │ └── validation.py │ ├── distributed │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data_parallel_base.py │ │ ├── distributed_data_parallel.py │ │ ├── distributed_data_parallel_config.py │ │ ├── finalize_model_grads.py │ │ ├── param_and_grad_buffer.py │ │ └── torch_fully_sharded_data_parallel.py │ ├── enums.py │ ├── export │ │ ├── __init__.py │ │ ├── data_type.py │ │ ├── export_config.py │ │ ├── model_type.py │ │ └── trtllm │ │ │ ├── __init__.py │ │ │ ├── engine_builder │ │ │ ├── __init__.py │ │ │ └── trtllm_engine_builder.py │ │ │ ├── model_to_trllm_mapping │ │ │ ├── __init__.py │ │ │ └── default_conversion_dict.py │ │ │ ├── trt_model_config.py │ │ │ ├── trt_model_type.py │ │ │ ├── trtllm_helper.py │ │ │ ├── trtllm_layers.py │ │ │ └── trtllm_weights_converter │ │ │ ├── __init__.py │ │ │ ├── 
distributed_trtllm_model_weights_converter.py │ │ │ └── single_device_trtllm_model_weights_converter.py │ ├── extensions │ │ ├── __init__.py │ │ └── transformer_engine.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_geglu.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_bias_swiglu.py │ │ ├── fused_cross_entropy.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference │ │ ├── __init__.py │ │ ├── ammo_support │ │ │ ├── __init__.py │ │ │ └── gpt │ │ │ │ ├── model_specs.py │ │ │ │ └── state_dict_hooks.py │ │ ├── common_inference_params.py │ │ ├── communication_utils.py │ │ ├── engines │ │ │ ├── __init__.py │ │ │ ├── abstract_engine.py │ │ │ └── mcore_engine.py │ │ ├── inference_request.py │ │ ├── model_inference_wrappers │ │ │ ├── __init__.py │ │ │ ├── abstract_model_inference_wrapper.py │ │ │ ├── gpt │ │ │ │ ├── __init__.py │ │ │ │ └── gpt_inference_wrapper.py │ │ │ ├── inference_wrapper_config.py │ │ │ └── t5 │ │ │ │ ├── __init__.py │ │ │ │ └── t5_inference_wrapper.py │ │ ├── modelopt_support │ │ │ ├── __init__.py │ │ │ └── gpt │ │ │ │ ├── __init__.py │ │ │ │ ├── model_specs.py │ │ │ │ └── state_dict_hooks.py │ │ ├── scheduler.py │ │ ├── text_generation_controllers │ │ │ ├── __init__.py │ │ │ ├── encoder_decoder_text_generation_controller.py │ │ │ └── simple_text_generation_controller.py │ │ └── utils.py │ ├── inference_params.py │ ├── jit.py │ ├── model_parallel_config.py │ ├── models │ │ ├── T5 │ │ │ ├── __init__.py │ │ │ ├── t5_model.py │ │ │ └── t5_spec.py │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── bert_layer_specs.py │ │ │ ├── bert_lm_head.py │ │ │ ├── bert_model.py │ │ │ └── pooler.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── embeddings │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model_embedding.py │ │ │ │ ├── rope_utils.py │ │ │ │ ├── rotary_pos_embedding.py │ │ │ │ └── yarn_rotary_pos_embedding.py │ │ │ ├── language_module │ │ │ │ ├── __init__.py │ │ │ │ └── language_module.py │ │ │ 
└── vision_module │ │ │ │ ├── __init__.py │ │ │ │ └── vision_module.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_layer_specs.py │ │ │ ├── gpt_model.py │ │ │ └── moe_module_specs.py │ │ ├── mamba │ │ │ ├── __init__.py │ │ │ ├── mamba_layer_specs.py │ │ │ └── mamba_model.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── llava_model.py │ │ │ └── llava_spec.py │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── base_attention.py │ │ │ ├── config.py │ │ │ ├── decoder_attention.py │ │ │ ├── decoder_spec.py │ │ │ ├── encoder_attention.py │ │ │ ├── encoder_spec.py │ │ │ ├── model.py │ │ │ └── utils.py │ │ └── vision │ │ │ ├── __init__.py │ │ │ ├── clip_vit_model.py │ │ │ ├── multimodal_projector.py │ │ │ └── vit_layer_specs.py │ ├── num_microbatches_calculator.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── distrib_optimizer.py │ │ ├── grad_scaler.py │ │ ├── optimizer.py │ │ └── optimizer_config.py │ ├── optimizer_param_scheduler.py │ ├── package_info.py │ ├── packed_seq_params.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── rerun_state_machine.py │ ├── ssm │ │ ├── __init__.py │ │ ├── mamba_block.py │ │ ├── mamba_hybrid_layer_allocation.py │ │ ├── mamba_layer.py │ │ ├── mamba_mixer.py │ │ └── triton_cache_manager.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── timers.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── cuda_graphs.py │ │ ├── custom_layers │ │ │ ├── __init__.py │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── moe │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── legacy_a2a_token_dispatcher.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── 
router.py │ │ │ ├── shared_experts.py │ │ │ ├── token_dispatcher.py │ │ │ └── upcycling_utils.py │ │ ├── multi_latent_attention.py │ │ ├── spec_utils.py │ │ ├── torch_layer_norm.py │ │ ├── torch_norm.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── inference │ ├── __init__.py │ ├── algos │ │ ├── __init__.py │ │ └── distillation.py │ ├── arguments.py │ ├── checkpointing.py │ ├── docs │ │ └── distillation.md │ ├── endpoints │ │ ├── common.py │ │ └── completions.py │ ├── gpt │ │ ├── __init__.py │ │ ├── loss_func.py │ │ └── model_provider.py │ ├── static │ │ └── index.html │ ├── text_generation │ │ ├── __init__.py │ │ ├── api.py │ │ ├── beam_utils.py │ │ ├── communication.py │ │ ├── forward_step.py │ │ ├── generation.py │ │ ├── sampling.py │ │ └── tokenization.py │ └── text_generation_server.py ├── legacy │ ├── data │ │ ├── __init__.py │ │ ├── autoaugment.py │ │ ├── biencoder_dataset_utils.py │ │ ├── data_samplers.py │ │ ├── dataset_utils.py │ │ ├── ict_dataset.py │ │ ├── image_folder.py │ │ ├── multimodal_dataset.py │ │ ├── orqa_wiki_dataset.py │ │ ├── realm_dataset_utils.py │ │ ├── realm_index.py │ │ └── vit_dataset.py │ ├── fp16_deprecated │ │ └── loss_scaler.py │ ├── fused_kernels │ │ ├── __init__.py │ │ ├── compat.h │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_fused_kernels.py │ │ └── type_shim.h │ ├── indexer.py │ ├── model │ │ ├── __init__.py │ │ ├── bert_model.py │ │ ├── biencoder_model.py │ │ ├── classification.py │ │ ├── enums.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ ├── fused_softmax.py │ │ ├── gpt_model.py │ │ ├── language_model.py │ │ ├── module.py │ │ ├── multiple_choice.py │ │ ├── realm_model.py │ │ ├── rms_norm.py │ │ ├── t5_model.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── vision │ │ │ ├── classification.py │ │ │ ├── dino.py │ │ │ ├── esvit_swin_backbone.py │ │ │ ├── inpainting.py │ │ │ ├── knn_monitor.py │ │ │ ├── mit_backbone.py │ │ │ ├── 
swin_backbone.py │ │ │ ├── utils.py │ │ │ └── vit_backbone.py │ └── mpu │ │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py └── training │ ├── __init__.py │ ├── activations.py │ ├── arguments.py │ ├── async_utils.py │ ├── checkpointing.py │ ├── dist_signal_handler.py │ ├── ft_integration.py │ ├── global_vars.py │ ├── initialize.py │ ├── log_handler.py │ ├── one_logger_utils.py │ ├── theoretical_memory_usage.py │ ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ ├── multimodal_tokenizer.py │ └── tokenizer.py │ ├── training.py │ ├── utils.py │ └── yaml_arguments.py ├── mypy.ini ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_mamba.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── pretrain_vlm.py ├── pyproject.toml ├── pytest.ini ├── requirements ├── pytorch:24.01 │ └── requirements.txt └── pytorch:24.07 │ └── requirements.txt ├── scripts ├── train_llama_182m_dense.sh ├── train_llama_182m_moe.sh ├── train_llama_182m_remoe.sh ├── train_llama_469m_dense.sh ├── train_llama_469m_moe.sh ├── train_llama_469m_remoe.sh ├── train_llama_978m_dense.sh ├── train_llama_978m_moe.sh └── train_llama_978m_remoe.sh ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── quantize │ └── calibrate_gpt.py ├── race │ ├── data.py │ └── finetune.py ├── 
vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── functional_tests │ ├── __init__.py │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── common.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── test_ci_pipeline.py │ │ ├── test_fp8_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ ├── _run_training.sh │ │ └── run_ci_test.sh │ └── test_cases │ │ ├── bert │ │ ├── bert_mr_mcore_tp2_pp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── bert_mr_tp1_pp4_vp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── bert_mr_tp2_pp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── 
bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_nightly_dgx_a100_1N8G_tp1_pp2 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── bert_nightly_dgx_a100_1N8G_tp4_pp1 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ └── bert_release │ │ │ ├── golden_values_0.9.0.json │ │ │ └── model_config.yaml │ │ ├── common │ │ └── ckpt_converter │ │ │ ├── __main__.py │ │ │ └── model_config.yaml │ │ ├── gpt-nemo │ │ ├── gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ └── gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt │ │ ├── gpt3_15b_8t_release │ │ │ ├── golden_values_0.8.0.json │ │ │ ├── golden_values_0.9.0.json │ │ │ └── model_config.yaml │ │ ├── gpt3_15b_8t_release_sm │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 │ │ │ ├── 
golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch │ │ │ ├── golden_values_dev.json │ │ │ ├── 
golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1 │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ 
├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml 
│ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── 
gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G │ │ │ 
└── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json 
│ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── 
gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_te_tp2_pp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_tp2_pp2_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G │ │ │ └── model_config.yaml │ │ └── gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume │ │ │ └── model_config.yaml │ │ ├── mixtral │ │ ├── mixtral_8x22b_tp2pp8ep8vpp1_release │ │ │ ├── golden_values_0.9.0.json │ │ │ └── model_config.yaml │ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release │ │ │ ├── golden_values_0.8.0.json │ │ │ ├── golden_values_0.9.0.json │ │ │ └── model_config.yaml │ │ ├── mixtral_8x7b_alltoall_tp2pp4ep4_release_sm │ │ │ └── model_config.yaml │ │ └── mixtral_8x7b_tp1pp4ep8vpp8_release │ │ │ ├── 
golden_values_0.9.0.json │ │ │ └── model_config.yaml │ │ ├── multimodal-llava │ │ ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ ├── multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G │ │ │ ├── golden_values_dev.json │ │ │ ├── golden_values_lts.json │ │ │ └── model_config.yaml │ │ └── multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G │ │ │ └── model_config.yaml │ │ └── t5 │ │ ├── t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G │ │ └── model_config.yaml │ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch │ │ └── model_config.yaml │ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── 
t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch │ │ └── model_config.yaml │ │ ├── t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 │ │ ├── golden_values_dev.json │ │ ├── golden_values_lts.json │ │ └── model_config.yaml │ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 │ │ └── golden_values_lts.json │ │ ├── t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel │ │ └── golden_values_lts.json │ │ └── t5_release │ │ ├── golden_values_0.9.0.json │ │ └── model_config.yaml ├── test_utils │ ├── python_scripts │ │ ├── common.py │ │ ├── generate_jet_trigger_job.py │ │ ├── generate_local_jobs.py │ │ └── launch_jet_workload.py │ ├── recipes │ │ ├── _build-mcore-dev.yaml │ │ ├── _build-mcore-lts.yaml │ │ ├── _build-nemo.yaml │ │ ├── bert.yaml │ │ ├── gpt-modelopt.yaml │ │ ├── gpt-nemo.yaml │ │ ├── gpt.yaml │ │ ├── multimodal-llava.yaml │ │ ├── t5.yaml │ │ └── unit-tests.yaml │ └── shell_scripts │ │ └── notify.sh └── unit_tests │ ├── __init__.py │ ├── conftest.py │ ├── data │ ├── __init__.py │ ├── test_bin_reader.py │ ├── test_builder.py │ ├── test_gpt_dataset.py │ ├── test_multimodal_dataset.py │ ├── test_preprocess_data.py │ └── test_preprocess_mmdata.py │ ├── dist_checkpointing │ ├── __init__.py │ ├── conftest.py │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── test_bert_model.py │ │ ├── test_gpt_model.py │ │ ├── test_mamba.py │ │ ├── test_mlp_glu.py │ │ ├── test_moe_experts.py │ │ ├── test_retro_model.py │ │ └── test_t5_model.py │ ├── test_async_save.py │ ├── test_cached_metadata.py │ ├── test_flattened_resharding.py │ ├── test_fp8.py │ ├── test_fully_parallel.py │ ├── test_local.py │ ├── test_mapping.py │ ├── test_nonpersistent.py │ ├── test_optimizer.py │ ├── test_serialization.py │ └── utils.py │ ├── distributed │ ├── test_grad_reduce_for_replicated_embedder.py │ └── 
test_param_and_grad_buffer.py │ ├── export │ └── trtllm │ │ ├── __init__.py │ │ ├── test_distributed_fp8.py │ │ ├── test_single_device_fp8.py │ │ ├── test_trtllm_distributed_gpu_converter.py │ │ ├── test_trtllm_helper.py │ │ ├── test_trtllm_layers.py │ │ └── test_trtllm_single_device_converter.py │ ├── fusions │ └── test_torch_softmax.py │ ├── inference │ ├── __init__.py │ ├── engines │ │ ├── __init__.py │ │ └── test_mcore_engine.py │ ├── model_inference_wrappers │ │ ├── __init__.py │ │ ├── gpt │ │ │ └── test_gpt_inference_wrapper.py │ │ ├── t5 │ │ │ └── test_t5_inference_wrapper.py │ │ └── test_model_inference_wrapper_config.py │ ├── test_common_inference_params.py │ ├── test_flash_decode.py │ ├── test_inference_utils.py │ ├── test_modelopt_gpt_model.py │ ├── test_scheduler.py │ └── text_generation_controllers │ │ ├── __init__.py │ │ ├── test_encoder_decoder_text_generation_controller.py │ │ └── test_simple_text_generation_controller.py │ ├── models │ ├── __init__.py │ ├── test_base_embedding.py │ ├── test_bert_model.py │ ├── test_clip_vit_model.py │ ├── test_gpt_model.py │ ├── test_llava_model.py │ ├── test_mamba_model.py │ ├── test_multimodal_projector.py │ └── test_t5_model.py │ ├── pipeline_parallel │ ├── __init__.py │ ├── test_helpers.py │ └── test_schedules.py │ ├── ssm │ ├── test_mamba_block.py │ ├── test_mamba_hybrid_layer_allocation.py │ ├── test_mamba_layer.py │ └── test_mamba_mixer.py │ ├── tensor_parallel │ ├── __init__.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialization.py │ ├── test_layers.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_imports.py │ ├── test_inference.py │ ├── test_local_multi_tensor_fns.py │ ├── test_num_microbatches_calculator.py │ ├── test_optimizer.py │ ├── test_optimizer_param_scheduler.py │ ├── test_parallel_state.py │ ├── test_tokenizer.py │ ├── test_training.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── 
__init__.py │ ├── moe │ ├── __init__.py │ ├── conftest.py │ ├── test_a2a_token_dispatcher.py │ ├── test_aux_loss.py │ ├── test_grouped_mlp.py │ ├── test_moe_layer.py │ ├── test_routers.py │ ├── test_sequential_mlp.py │ ├── test_shared_experts.py │ ├── test_token_dispatcher.py │ └── test_upcycling.py │ ├── test_attention.py │ ├── test_attention_packed_seq.py │ ├── test_core_attention.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_multi_latent_attention.py │ ├── test_retro_attention.py │ ├── test_rope.py │ ├── test_spec_customization.py │ ├── test_transformer_block.py │ └── test_transformer_layer.py ├── tools ├── autoformat.sh ├── bert_embedding │ ├── __init__.py │ ├── dataset.py │ ├── embed.py │ ├── external_libs.py │ └── huggingface.py ├── checkpoint │ ├── convert.py │ ├── hybrid_conversion.py │ ├── loader_llama_mistral.py │ ├── loader_mcore.py │ ├── loader_megatron.py │ ├── loader_mixtral_hf.py │ ├── saver_mcore.py │ ├── saver_megatron.py │ ├── schema_base.py │ ├── schema_mcore.py │ └── utils.py ├── copyright.sh ├── linter.py ├── merge_datasets.py ├── openwebtext │ ├── README.md │ ├── add_id.py │ ├── blacklist_urls.py │ ├── cleanup_dataset.py │ ├── cleanup_fix_dataset.py │ ├── filter_ngrams.py │ ├── find_duplicates.py │ ├── group_duplicate_url.py │ ├── merge_jsons.py │ └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── report_theoretical_memory.py ├── retro │ ├── README.md │ ├── build_db.md │ ├── cli │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── cli.py │ ├── config_utils.py │ ├── docker │ │ └── Dockerfile │ ├── preprocess_data.py │ ├── sft │ │ ├── README.md │ │ ├── dataset_conv.py │ │ ├── open_inst.sh │ │ ├── sft_retro.py │ │ └── sft_retro_lm.sh │ └── text_generation │ │ ├── evaluate.py │ │ ├── metrics.py │ │ ├── retro_api.py │ │ ├── retro_generate.sh │ │ ├── retro_generation.py │ │ └── retro_text_generation.py ├── run_mamba_text_generation_server.py ├── run_text_generation_server.py ├── 
run_vlm_text_generation.py └── text_generation_cli.py └── unit-test-job-lts.yaml /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | extend-ignore = E203,E501,F401,E402,E714 4 | per-file-ignores = __init__.py:F401 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 21 | 22 | **Environment (please complete the following information):** 23 | - Megatron-LM commit ID 24 | - PyTorch version 25 | - CUDA version 26 | - NCCL version 27 | 28 | **Proposed fix** 29 | If you have a proposal for how to fix the issue state it here or link to a PR. 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 
33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ENHANCEMENT 3 | about: Suggest an idea to improve this project 4 | title: "[ENHANCEMENT]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Proposed implementation** 20 | If you have a proposed implementation for the feature state it here or link to a PR. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUESTION 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement 4 | request 5 | title: "[QUESTION]" 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Your question** 12 | Ask a clear and concise question about Megatron-LM. 
13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/regression.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: REGRESSION 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update 4 | title: "[REGRESSION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the regression** 11 | A clear and concise description of what the regression is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Previous performance** 17 | What speed or accuracy did you previously see. 18 | 19 | **New performance** 20 | What speed or accuracy do you see after the update. 21 | 22 | **Stack trace/logs** 23 | If applicable, add the stack trace or logs related to the regression. 24 | 25 | **Environment (please complete the following information):** 26 | - Previous Megatron-LM commit ID 27 | - New Megatron-LM commit ID 28 | - Previous PyTorch version 29 | - New PyTorch version 30 | - Previous CUDA version 31 | - New CUDA version 32 | - Previous NCCL version 33 | - New NCCL version 34 | 35 | **Proposed fix** 36 | If you have a proposal for how to fix the issue state it here or link to a PR. 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 
4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.' 27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | .vscode 10 | local/ 11 | .gitmodules -------------------------------------------------------------------------------- /.gitlab/labeler-config.yml: -------------------------------------------------------------------------------- 1 | CI: 2 | - .gitlab-ci.yml 3 | - Dockerfile.ci.lts 4 | - Dockerfile.ci.dev 5 | - .github/** 6 | - .gitlab/** 7 | 8 | Datasets: 9 | - megatron/core/datasets/** 10 | 11 | BERT: 12 | - megatron/core/models/bert/** 13 | 14 | GPT: 15 | - megatron/core/models/gpt/** 16 | 17 | RETRO: 18 | - megatron/core/models/retro/** 19 | 20 | Dist-Ckpt: 21 | - megatron/core/dist_checkpointing 22 | 23 | Dist-Opt: 24 | - megatron/core/optimizer/distrib_optimizer 25 | 26 | Inference: 27 | - megatron/core/inference 28 | 29 | MoE: 30 | - megatron/core/transformer/moe 31 | 32 | Tests: 33 | - tests/** -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | 
ignore-paths=tests 3 | max-line-length=100 4 | 5 | [MESSAGES CONTROL] 6 | disable=all 7 | 8 | enable=C0115,C0116,W0611,C0301 9 | # C0115: missing-class-docstring 10 | # C0116: missing-function-docstring 11 | # W0611: unused-import 12 | # C0301: line-too-long 13 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | [Core-ADLR] @mcore-reviewers/core-adlr 2 | megatron/core/ 3 | 4 | [Core-NeMo] @mcore-reviewers/core-nemo 5 | megatron/core/ 6 | 7 | ^[Core-MLPerf] @mcore-reviewers/mlperf 8 | megatron/core/ 9 | 10 | [MoE-ADLR] @mcore-reviewers/moe-adlr 11 | megatron/core/transformer/moe/ 12 | 13 | [MoE-Moe] @mcore-reviewers/moe-moe 14 | megatron/core/transformer/moe/ 15 | 16 | [Datasets] @mcore-reviewers/datasets 17 | megatron/core/datasets/ 18 | 19 | [BERT] @mcore-reviewers/bert 20 | megatron/core/models/bert/ 21 | 22 | [GPT] @mcore-reviewers/gpt 23 | megatron/core/models/gpt/ 24 | 25 | [Retro] @mcore-reviewers/retro 26 | megatron/core/models/retro/ 27 | 28 | [Distributed Checkpointing] @mcore-reviewers/dist-checkpointing 29 | megatron/core/dist_checkpointing/ 30 | 31 | [Distributed Optimizer] @mcore-reviewers/dist-optimizer 32 | megatron/core/optimizer/distrib_optimizer/ 33 | 34 | [Inference] @mcore-reviewers/inference 35 | megatron/core/inference/ 36 | 37 | ^[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference 38 | megatron/core/inference/ 39 | 40 | ; [Context Parallelism] @mcore-reviewers/context-parallelism 41 | ; 42 | 43 | [CI] @mcore-reviewers/ci 44 | .gitlab/ 45 | .github/ 46 | .gitlab-ci.yml 47 | Dockerfile.ci.lts 48 | Dockerfile.ci.dev 49 | tests/ 50 | -------------------------------------------------------------------------------- /Dockerfile.linting: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | ARG FROM_IMAGE_NAME 4 | FROM 
$FROM_IMAGE_NAME as main 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ 8 | /etc/apt/apt.conf.d/docker-clean 9 | 10 | RUN apt-get update && \ 11 | apt-get install -y python3-venv && \ 12 | apt-get clean && \ 13 | python -m venv /opt/jet 14 | 15 | RUN pip3 install --no-cache-dir \ 16 | black==24.4.2 \ 17 | isort==5.13.2 \ 18 | flake8==7.1.0 \ 19 | pylint==3.2.6 \ 20 | mypy 21 | 22 | COPY . /opt/megatron-lm 23 | 24 | WORKDIR /opt/megatron-lm 25 | 26 | ##### For NVIDIANS only ##### 27 | FROM main as jet 28 | ARG CACHEBUST=0 29 | RUN --mount=type=secret,id=JET_INDEX_URLS \ 30 | JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ 31 | pip install jet-client jet-api --upgrade $JET_INDEX_URLS 32 | ENV PATH="$PATH:/opt/jet/bin" 33 | ### -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/core/requirements.txt 2 | include megatron/core/README.md 3 | recursive-include requirements * 4 | -------------------------------------------------------------------------------- /data_preprocessing.sh: -------------------------------------------------------------------------------- 1 | for i in $(seq -w 0 29); do 2 | python tools/preprocess_data.py \ 3 | --input ../pile/${i}.jsonl \ 4 | --output-prefix ../pile_gpt_test/${i} \ 5 | --vocab-file ../gpt2-vocab.json \ 6 | --tokenizer-type GPT2BPETokenizer \ 7 | --merge-file ../gpt2-merges.txt \ 8 | --append-eod \ 9 | --workers 32 10 | done -------------------------------------------------------------------------------- /docs/source/api-guide/index.rst: -------------------------------------------------------------------------------- 1 | API Guide 2 | ========= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | models 8 | tensor_parallel 9 | context_parallel 10 | pipeline_parallel 11 | fusions 12 | transformer 13 | moe 14 | dist_checkpointing 15 | dist_optimizer 16 | distributed 17 | datasets 18 | num_microbatches_calculator 19 | optimizer_param_scheduler 20 | encoder_decoder_parallelism -------------------------------------------------------------------------------- /docs/source/api-guide/models.bert.rst: -------------------------------------------------------------------------------- 1 | models.bert package 2 | =================== 3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.bert.bert\_model module 9 | ------------------------------ 10 | 11 | .. automodule:: core.models.bert.bert_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: core.models.bert 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.gpt.rst: -------------------------------------------------------------------------------- 1 | models.gpt package 2 | ================== 3 | This is the implementation of the popular GPT model. It supports several features such as model parallelism (Tensor Parallel, Pipeline Parallel, Data Parallel), mixture of experts, FP8, the distributed optimizer, etc. We are constantly adding new features, so be on the lookout or raise an issue if you want to have something added. 4 | 5 | Submodules 6 | ---------- 7 | 8 | models.gpt.gpt\_model module 9 | ---------------------------- 10 | 11 | .. automodule:: core.models.gpt.gpt_model 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: core.models.gpt 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.rst: -------------------------------------------------------------------------------- 1 | models package 2 | ============== 3 | This package contains most of the popular LLMs. Currently we have support for GPT, BERT, T5 and Retro. This is an ever-growing list, so keep an eye out. 4 | 5 | Subpackages 6 | ----------- 7 | 8 | .. toctree:: 9 | :maxdepth: 4 10 | 11 | models.gpt 12 | models.t5 13 | models.bert 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: core.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/models.t5.rst: -------------------------------------------------------------------------------- 1 | models.t5 package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | models.t5.t5\_model module 8 | -------------------------- 9 | 10 | .. automodule:: core.models.T5.t5_model 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: core.models.T5 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api-guide/moe.rst: -------------------------------------------------------------------------------- 1 | Mixture of Experts package 2 | ========================== 3 | 4 | .. 
mdinclude :: ../../../megatron/core/transformer/moe/README.md 5 | -------------------------------------------------------------------------------- /docs/source/api-guide/num_microbatches_calculator.rst: -------------------------------------------------------------------------------- 1 | Microbatches Calculator 2 | ======================= 3 | This API is used to calculate the number of microbatches required to fit a given model on a given batch size. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | .. automodule:: core.num_microbatches_calculator 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/api-guide/optimizer_param_scheduler.rst: -------------------------------------------------------------------------------- 1 | Optimizer Parameters Scheduler 2 | ============================== 3 | This API is used to calculate the learning rate and weight decay for the optimizer. 4 | 5 | 6 | Module contents 7 | --------------- 8 | 9 | .. 
automodule:: core.optimizer_param_scheduler 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/context_parallel/CP_overview.png -------------------------------------------------------------------------------- /docs/source/images/context_parallel/CP_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/context_parallel/CP_results.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/source/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /docs/source/images/moe/token_drop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/docs/source/images/moe/token_drop.png -------------------------------------------------------------------------------- /docs/source/index.rst: 
-------------------------------------------------------------------------------- 1 | .. Lumache documentation master file, created by 2 | sphinx-quickstart on Tue Aug 15 13:44:10 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Megatron Core User Guide 7 | =================================== 8 | 9 | **Megatron Core** is a Python library that has the core components required to build your language models. 10 | A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo/tree/main>`_. It offers a *simple* and 11 | *intuitive* API. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: User Guide 16 | 17 | user-guide/index 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | :caption: API Guide 22 | 23 | api-guide/index 24 | -------------------------------------------------------------------------------- /docs/source/user-guide/index.rst: -------------------------------------------------------------------------------- 1 | User Guide 2 | ============ 3 | 4 | .. 
mdinclude:: ../../../megatron/core/QuickStart.md -------------------------------------------------------------------------------- /examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=pt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp). 
5 | 6 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c 
"${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 
29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 
11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . 
`pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 
29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/export/README.md: -------------------------------------------------------------------------------- 1 | # Megatron Core Export 2 | 3 | This module is used to export megatron core models to different inference frameworks. 4 | Currently we support TRTLLM export. In the future we will be adding support for VLLM etc. 5 | 6 | ## PTQ AND EXPORT 7 | Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post training quantization, followed by an export to TRTLLM format. 8 | 9 | # TRTLLM EXPORT 10 | Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone. -------------------------------------------------------------------------------- /examples/inference/llama_mistral/huggingface_reference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 3 | 4 | # Set up argument parsing 5 | parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.") 6 | parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation") 7 | parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint") 8 | 9 | # Parse command-line arguments 10 | args = parser.parse_args() 11 | 12 | model_path = args.model_path 13 | prompt = args.prompt 14 | 15 | config = AutoConfig.from_pretrained(model_path) 16 | tokenizer = AutoTokenizer.from_pretrained(model_path, config=config) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda() 18 | 19 | inputs = tokenizer(prompt, return_tensors="pt") 20 | for key in inputs: 21 | inputs[key] = inputs[key].cuda() 22 | # top_k, top_p and do_sample are set for greedy 
argmax based sampling 23 | 24 | outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) 25 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --seed 42 32 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 
1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --vocab-file $VOCAB_FILE \ 28 | --merge-file $MERGE_FILE \ 29 | --seed 42 30 | -------------------------------------------------------------------------------- /examples/mamba/.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 2 | data-cache/ 3 | tensorboard/ 4 | triton-cache/ 5 | -------------------------------------------------------------------------------- /examples/multimodal/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:24.02-py3 2 | 3 | RUN apt update && \ 4 | apt -y upgrade && \ 5 | apt install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | python3-pip \ 9 | python3-dev \ 10 | bash \ 11 | git \ 12 | vim \ 13 | tmux \ 14 | python-is-python3 \ 15 | default-jre 16 | 17 | RUN pip install --upgrade pip 18 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging 19 | RUN pip install transformers datasets accelerate timm 20 | RUN pip install pytest-cov pytest_mock nltk wrapt 21 | RUN pip install zarr "tensorstore==0.1.45" 22 | RUN pip install black isort click==8.0.2 23 | RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken 24 | RUN pip install git+https://github.com/openai/CLIP.git 25 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies. 
26 | RUN pip install open_clip_torch open-flamingo[eval] --no-deps 27 | -------------------------------------------------------------------------------- /examples/multimodal/assets/pretrain_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/examples/multimodal/assets/pretrain_curves.png -------------------------------------------------------------------------------- /examples/multimodal/convert_llava_pretrain_to_wds.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import webdataset as wds 4 | 5 | from tqdm import tqdm 6 | 7 | llava_pretrain_dir = '' 8 | 9 | # Paths to the dataset files 10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json') 11 | output = os.path.join(llava_pretrain_dir, 'wds') 12 | 13 | if not os.path.exists(output): 14 | os.mkdir(output) 15 | 16 | # Load data 17 | with open(json_file, 'r') as f: 18 | data = json.load(f) 19 | 20 | with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: 21 | for entry in tqdm(data): 22 | with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: 23 | image_data = img_file.read() 24 | sample = { 25 | "__key__": entry['id'], 26 | "jpg": image_data, 27 | "json": json.dumps(entry['conversations']).encode("utf-8"), 28 | } 29 | shard_writer.write(sample) 30 | 31 | print(f"Dataset successfully converted to wds") 32 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/pretrain_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1. 
7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | - weight: 0.01 17 | path: 18 | subflavors: 19 | augmentation: False 20 | 21 | # Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets. 22 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 23 | val: 24 | datasets: 25 | - weight: 1. 26 | path: 27 | subflavors: 28 | augmentation: False 29 | -------------------------------------------------------------------------------- /examples/multimodal/nvlm/sft_blend.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1. 7 | path: 8 | subflavors: 9 | augmentation: False 10 | 11 | - weight: 0.02 12 | path: 13 | subflavors: 14 | augmentation: False 15 | 16 | # Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets. 17 | # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format. 18 | val: 19 | datasets: 20 | - weight: 1. 21 | path: 22 | subflavors: 23 | augmentation: False 24 | -------------------------------------------------------------------------------- /examples/multimodal/pretrain_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 
13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/multimodal/sft_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/t5/t5_mcore_train_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/examples/t5/t5_mcore_train_curve.png -------------------------------------------------------------------------------- /images/model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/model_table.png -------------------------------------------------------------------------------- /images/remoe_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/remoe_comparison.png -------------------------------------------------------------------------------- /images/remoe_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/remoe_scaling.png -------------------------------------------------------------------------------- /images/strong_scaling.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/strong_scaling.png -------------------------------------------------------------------------------- /images/weak_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/images/weak_scaling.png -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | from megatron.core import parallel_state 5 | from megatron.core.distributed import DistributedDataParallel 6 | from megatron.core.inference_params import InferenceParams 7 | from megatron.core.model_parallel_config import ModelParallelConfig 8 | from megatron.core.package_info import ( 9 | __contact_emails__, 10 | __contact_names__, 11 | __description__, 12 | __download_url__, 13 | __homepage__, 14 | __keywords__, 15 | __license__, 16 | __package_name__, 17 | __repository_url__, 18 | __shortversion__, 19 | __version__, 20 | ) 21 | from megatron.core.timers import Timers 22 | 23 | # Alias parallel_state as mpu, its legacy name 24 | mpu = parallel_state 25 | 26 | __all__ = [ 27 | "parallel_state", 28 | "tensor_parallel", 29 | "utils", 30 | "DistributedDataParallel", 31 | "InferenceParams", 32 | "ModelParallelConfig", 33 | "Timers", 34 | ] 35 | -------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | 4 | LIBNAME = helpers_cpp 5 | LIBEXT = $(shell python3-config 
--extension-suffix) 6 | 7 | OUT = $(LIBNAME)$(LIBEXT) 8 | SRC = helpers.cpp 9 | 10 | default: $(OUT) 11 | 12 | $(OUT): $(SRC) 13 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 14 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .config import RetroGPTChunkDatasets 4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig 5 | from .query.retro_dataset import get_retro_datasets 6 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - Embedder: Base class for all Bert embedders. 7 | - RetroBertEmbedders: Container class for in-memory and on-disk embedders. 8 | - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. 9 | - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. 10 | - RetroTokenizers: Container class for GPT and Bert tokenizers. 
11 | """ 12 | 13 | from .bert_embedders import Embedder, RetroBertEmbedders 14 | from .config import RetroPreprocessingConfig 15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets 16 | from .tokenizers import RetroTokenizers 17 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/gpt_chunk_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for GPT chunk datasets (train, valid, and test).""" 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class RetroGPTChunkDatasets: 10 | """Container dataclass for GPT chunk datasets.""" 11 | 12 | # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. 13 | train: dict = None 14 | valid: dict = None 15 | test: dict = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container class for GPT and Bert tokenizers.""" 4 | 5 | from dataclasses import dataclass 6 | 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer 8 | 9 | 10 | @dataclass 11 | class RetroTokenizers: 12 | """Container class for GPT and Bert tokenizers.""" 13 | 14 | gpt: MegatronTokenizer = None 15 | bert: MegatronTokenizer = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - build_db: Build a chunk database from a list of indexed datasets. 
7 | """ 8 | 9 | from .build import build_db 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Required external libraries for Retro preprocessing.""" 4 | 5 | import importlib 6 | 7 | required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert 8 | 9 | for lib in required_libs: 10 | try: 11 | globals()[lib] = importlib.import_module(lib) 12 | except ImportError as e: 13 | raise Exception( 14 | f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." 15 | ) 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - train_index: Train an index on representative vectors. 7 | - add_to_index: Add vectors to a trained index. 8 | - build_index: Wrapper function that calls above two functions. 9 | """ 10 | 11 | from .build import add_to_index, build_index, train_index 12 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index. 
7 | """ 8 | 9 | from .faiss_base import FaissBaseIndex 10 | from .faiss_par_add import FaissParallelAddIndex 11 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for querying the pretraining dataset.""" 4 | 5 | import os 6 | 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset 8 | 9 | 10 | def get_query_dir(project_dir: str) -> str: 11 | """Get root directory of all saved query data. 12 | 13 | Args: 14 | project_dir (str): Retro project dir. 15 | 16 | Returns: 17 | Path to query sub-directory in Retro project. 18 | """ 19 | return os.path.join(project_dir, "query") 20 | 21 | 22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: 23 | """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 24 | 25 | Args: 26 | project_dir (str): Retro project dir. 27 | key (str): Dataset split key; 'train', 'valid', or 'test'. 28 | dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. 29 | 30 | Returns: 31 | Path to directory containing this dataset's neighbors within Retro project. 
32 | """ 33 | return os.path.join( 34 | get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}") 35 | ) 36 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | remove_sharded_tensors, 11 | save, 12 | ) 13 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | from megatron.core.dist_checkpointing.strategies.common import register_default_common_strategies 5 | 6 | # We load "common" strategies by default to be always available 7 | register_default_common_strategies() 8 | -------------------------------------------------------------------------------- /megatron/core/distributed/README.md: -------------------------------------------------------------------------------- 1 | ## How to use pytorch FSDP2? 2 | 3 | Add these flags to enable Torch FSDP2. 4 | 5 | ``` 6 | --use-torch-fsdp2 7 | --no-gradient-accumulation-fusion 8 | --ckpt-format torch_dist 9 | ``` 10 | 11 | It is worth noting that CUDA_DEVICE_MAX_CONNECTIONS=1 should not be enabled to ensure that the communication of FSDP and the computation on the primary stream can be fully parallelized. 
12 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from packaging.version import Version 4 | 5 | from .distributed_data_parallel import DistributedDataParallel 6 | from .distributed_data_parallel_config import DistributedDataParallelConfig 7 | from .finalize_model_grads import finalize_model_grads 8 | from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel 9 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/data_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) 6 | -------------------------------------------------------------------------------- /megatron/core/export/export_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class ExportConfig: 8 | """Base configuration for Megatron Core Export 9 | 10 | These parameters control the export setting for trtllm 11 | """ 12 | 13 | inference_tp_size: int = 1 14 | 15 | inference_pp_size: int = 1 16 | 17 | use_parallel_embedding: bool = False 18 | 19 | use_embedding_sharing: bool = False 20 | -------------------------------------------------------------------------------- /megatron/core/export/model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from enum import Enum 4 | 5 | ModelType = Enum( 6 | 'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] 7 | ) 8 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/engine_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import tensorrt_llm 4 | 5 | from megatron.core.export.model_type import ModelType 6 | 7 | TRT_MODEL_CONFIG = { 8 | ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, 9 | ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, 10 | ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, 11 | ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, 12 | ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, 13 | ModelType.gemma: tensorrt_llm.models.GemmaConfig, 14 | ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, 15 | } 16 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trt_model_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.export.model_type import ModelType 4 | 5 | TRT_MODEL_TYPE_STRING = { 6 | ModelType.gpt: 'GPTForCausalLM', 7 | ModelType.gptnext: 'GPTForCausalLM', 8 | ModelType.starcoder: 'GPTForCausalLM', 9 | ModelType.mixtral: 'LlamaForCausalLM', 10 | ModelType.llama: 'LlamaForCausalLM', 11 | ModelType.gemma: 'GemmaForCausalLM', 12 | ModelType.falcon: 'FalconForCausalLM', 13 | } 14 | -------------------------------------------------------------------------------- /megatron/core/export/trtllm/trtllm_weights_converter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /megatron/core/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/extensions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import warnings 3 | 4 | warnings.warn( 5 | "The 'megatron.core.inference.ammo_support' module is deprecated and will be removed in a future release. " 6 | "Please use megatron.core.inference.modelopt_support instead", 7 | DeprecationWarning, 8 | ) 9 | -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/gpt/model_specs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec 3 | -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/gpt/state_dict_hooks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import ( 3 | mcore_gpt_load_legacy_state_dict_pre_hook, 4 | mcore_gpt_load_te_state_dict_pre_hook, 5 | ) 6 | -------------------------------------------------------------------------------- /megatron/core/inference/common_inference_params.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class CommonInferenceParams: 7 | """Inference parameters sent along with the prompts 8 | 9 | For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 10 | """ 11 | 12 | temperature: float = 1.0 13 | top_k: int = 0 14 | top_p: float = 0.0 15 | return_log_probs: bool = False 16 | num_tokens_to_generate: int = 30 17 | 18 | def add_attributes(self, attribute_value_pair: dict): 19 | """Utility to add more attributes to inference params 20 | 21 | Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows 22 | c = CommonInferenceParams 23 | c.add_attributes({'min_length':4, 'eod_id':153}) 24 | 25 | Args: 26 | attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. 
27 | """ 28 | for key, value in attribute_value_pair.items(): 29 | setattr(self, key, value) 30 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/abstract_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | 5 | 6 | class AbstractEngine(ABC): 7 | @staticmethod 8 | @abstractmethod 9 | def generate(self) -> dict: 10 | """The abstract backend's generate function. 11 | 12 | To define a new backend, implement this and return the outputs as a dictionary. 13 | 14 | Returns: 15 | dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`. 16 | """ 17 | pass 18 | -------------------------------------------------------------------------------- /megatron/core/inference/inference_request.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

import torch

from megatron.core.inference.common_inference_params import CommonInferenceParams


class Status(Enum):
    """Enum for the status of an inference request."""

    WAITING_IN_QUEUE = 1
    ACTIVE_AND_GENERATING_TOKENS = 2
    ACTIVE_BUT_NOT_GENERATING_TOKENS = 3
    COMPLETED = 4


@dataclass
class InferenceRequest:
    """Class for one inference request

    Plain data container holding the relevant data for an inference request.
    The first six fields are required at construction; the remaining fields
    have defaults and can be filled in later.

    Attributes:
        request_id (str): Identifier of the request.
        prompt (str): The prompt text.
        inference_parameters (CommonInferenceParams): Inference parameters
            attached to this request.
        prompt_tokens (List[int]): Integer tokens for the prompt (presumably
            the tokenized form of ``prompt`` -- confirm against the caller).
        arrival_time (float): Arrival time of the request (clock and unit are
            not visible in this file -- confirm against the caller).
        status (Status): Current status of the request.
        encoder_prompt (Optional[str]): Defaults to None.
        generated_text (Optional[str]): Defaults to None.
        generated_tokens (Optional[torch.Tensor]): Defaults to None.
        generated_log_probs (Optional[torch.Tensor]): Defaults to None.
        generated_length (int): Defaults to 0.
    """

    request_id: str
    prompt: str
    inference_parameters: CommonInferenceParams
    prompt_tokens: List[int]
    arrival_time: float
    status: Status
    # The fields below default to None, so they are annotated as Optional
    # (implicit Optional is rejected by PEP 484 compliant type checkers).
    encoder_prompt: Optional[str] = None
    generated_text: Optional[str] = None
    generated_tokens: Optional[torch.Tensor] = None
    generated_log_probs: Optional[torch.Tensor] = None
    generated_length: int = 0
2 | -------------------------------------------------------------------------------- /megatron/core/inference/modelopt_support/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt). 3 | 4 | ModelOpt is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to 5 | compress model for efficient inference on NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless 6 | experience for users to optimize their Megatron-core models for inference. More details on ModelOpt including 7 | installation and usage can be found at https://github.com/NVIDIA/TensorRT-Model-Optimizer. 8 | """ 9 | -------------------------------------------------------------------------------- /megatron/core/inference/modelopt_support/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/inference/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | class Counter: 3 | """A simple counter class 4 | 5 | This class is responsible for assigning request ids to incoming requests 6 | """ 7 | 8 | def __init__(self, start: int = 0) -> None: 9 | self.counter = start 10 | 11 | def __next__(self) -> int: 12 | i = self.counter 13 | self.counter += 1 14 | return i 15 | 16 | def reset(self) -> None: 17 | self.counter = 0 18 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.utils import is_torch_min_version 6 | 7 | jit_fuser = torch.jit.script 8 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 9 | if is_torch_min_version("2.2.0a0"): 10 | jit_fuser = torch.compile 11 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | from .t5_model import T5Model 3 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .rope_utils import apply_rotary_pos_emb 4 | from .rotary_pos_embedding import RotaryEmbedding 5 | from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale 6 | -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 
11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .gpt_model import GPTModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .mamba_model import MambaModel 3 | -------------------------------------------------------------------------------- /megatron/core/models/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - RetroConfig: configuration dataclass for RetroModel. 7 | - RetroModel: The Retro model. 8 | - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 9 | """ 10 | 11 | from .config import RetroConfig 12 | from .decoder_spec import get_retro_decoder_block_spec 13 | from .model import RetroModel 14 | -------------------------------------------------------------------------------- /megatron/core/models/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import os

import torch


def get_config_path(project_dir: str) -> str:
    """Return the path of the config copy stored within the retro project dir.

    Args:
        project_dir (str): Retro project directory.

    Returns:
        str: ``project_dir`` joined with ``config.json``.
    """
    config_name = "config.json"
    return os.path.join(project_dir, config_name)


def get_gpt_data_dir(project_dir: str) -> str:
    """Return the project-relative directory of the GPT bin/idx datasets.

    Args:
        project_dir (str): Retro project directory.

    Returns:
        str: ``project_dir`` joined with ``data``.
    """
    data_subdir = "data"
    return os.path.join(project_dir, data_subdir)


# ** Note ** : Retro's compatibility between cross attention and Flash/Fused
# Attention is currently a work in progress, so no mask is built for now.
# The disabled implementation returned an all-True boolean mask:
#     torch.full(size=size, fill_value=True, dtype=torch.bool, device=device)
def get_all_true_mask(size, device):
    """Placeholder for the all-True mask; currently always returns None.

    Args:
        size: Not used at the moment (mask shape in the disabled implementation).
        device: Not used at the moment (mask device in the disabled implementation).

    Returns:
        None
    """
    no_mask = None
    return no_mask
# Individual components of the package version.
MAJOR, MINOR, PATCH = 0, 10, 0
PRE_RELEASE = 'rc0'

# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

# "major.minor.patch", and the same string with the pre-release tag appended.
__shortversion__ = f'{MAJOR}.{MINOR}.{PATCH}'
__version__ = f'{__shortversion__}{PRE_RELEASE}'

# Packaging metadata.
__package_name__ = 'megatron_core'
__contact_names__ = 'NVIDIA'
__contact_emails__ = 'nemo-toolkit@nvidia.com'  # use NeMo Email
# use NeMo homepage
__homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/'
__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = (
    'Megatron Core - a library for efficient and scalable training of transformer based models'
)
__license__ = 'BSD-3'
__keywords__ = (
    'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
)
2 | from dataclasses import dataclass 3 | 4 | from torch import Tensor 5 | 6 | 7 | @dataclass 8 | class PackedSeqParams: 9 | ''' 10 | parameters to TEDotProductAttention and fused rope kernels for the 11 | `thd` (packed) sequence format 12 | ''' 13 | 14 | qkv_format: str = None 15 | cu_seqlens_q: Tensor = None 16 | cu_seqlens_kv: Tensor = None 17 | cu_seqlens_q_padded: Tensor = None 18 | cu_seqlens_kv_padded: Tensor = None 19 | max_seqlen_q: Tensor = None 20 | max_seqlen_kv: Tensor = None 21 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | from .schedules import get_forward_backward_func 3 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | packaging 3 | -------------------------------------------------------------------------------- /megatron/core/ssm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/ssm/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import MLATransformerConfig, TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/transformer_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import warnings 4 | 5 | warnings.warn( 6 | """The 'megatron.core.transformer.custom_layers.transformer_engine' 7 | module is deprecated and will be removed in 0.10.0. Please use 8 | 'megatron.core.extensions.transformer_engine' instead.""", 9 | DeprecationWarning, 10 | stacklevel=2, 11 | ) 12 | from megatron.core.extensions.transformer_engine import * 13 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 
7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | """Model Type 10 | 11 | encoder_or_decoder for bert, gpt etc 12 | encoder_and_decoder for multimodal , T5 etc 13 | """ 14 | 15 | encoder_or_decoder = 1 16 | encoder_and_decoder = 2 17 | 18 | 19 | # class LayerType(enum.Enum): 20 | # encoder = 1 21 | # decoder = 2 22 | 23 | 24 | class AttnType(enum.Enum): 25 | """Attention type""" 26 | 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | 31 | class AttnMaskType(enum.Enum): 32 | """Attention Mask Type""" 33 | 34 | padding = 1 35 | causal = 2 36 | no_mask = 3 # only used for TE 37 | padding_causal = 4 # only used for thd attention 38 | arbitrary = 5 39 | 40 | 41 | class AttnBackend(enum.Enum): 42 | """Attention Backend""" 43 | 44 | flash = 1 45 | fused = 2 46 | unfused = 3 47 | local = 4 48 | auto = 5 49 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x. 
20 | Such a func is handy for ops like `bias_dropout_fusion` which themselves 21 | return a function at runtime based on passed arguments 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | """Check if grouped_gemm is available.""" 11 | return grouped_gemm is not None 12 | 13 | 14 | def assert_grouped_gemm_is_available(): 15 | """Assert that grouped_gemm is available.""" 16 | assert grouped_gemm_is_available(), ( 17 | "Grouped GEMM is not available. Please run " 18 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4`." 19 | ) 20 | 21 | 22 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 23 | -------------------------------------------------------------------------------- /megatron/core/transformer/torch_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | from megatron.core.transformer.torch_norm import WrappedTorchNorm 3 | 4 | WrappedTorchLayerNorm = WrappedTorchNorm 5 | -------------------------------------------------------------------------------- /megatron/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/algos/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/endpoints/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import threading 5 | 6 | GENERATE_NUM = 0 7 | BEAM_NUM = 1 8 | LOCK = threading.Lock() 9 | 10 | 11 | def send_do_generate(): 12 | choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device="cuda") 13 | torch.distributed.broadcast(choice, 0) 14 | 15 | 16 | def send_do_beam_search(): 17 | choice = torch.tensor([BEAM_NUM], dtype=torch.long, device="cuda") 18 | torch.distributed.broadcast(choice, 0) 19 | -------------------------------------------------------------------------------- /megatron/inference/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .loss_func import loss_func 4 | from .model_provider import model_provider 5 | -------------------------------------------------------------------------------- /megatron/inference/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/legacy/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/legacy/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. 
*/ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-ml/ReMoE/463c6af38ffaa841941ea834470ee09f8d515652/megatron/legacy/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/legacy/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/legacy/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False, 12 | config: dict = None): 13 | """RMS Normaliation module 14 | 15 | Args: 16 | dim (int): The width of input, i.e. hidden size 17 | eps (float): epsilon to use for the norm, default to 1e-6 18 | sequence_parallel (bool): Set to true if sequence parallelism is being used, 19 | this marks the weights as needing to be allreduced. 20 | """ 21 | super().__init__() 22 | self.eps = eps 23 | self.weight = nn.Parameter(torch.ones(dim)) 24 | 25 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 26 | 27 | def _norm(self, x): 28 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 29 | 30 | def forward(self, x): 31 | output = self._norm(x.float()).type_as(x) 32 | return output * self.weight 33 | -------------------------------------------------------------------------------- /megatron/legacy/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import warnings

import torch
import torch.nn.functional as F


def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    """Resize ``input`` through ``F.interpolate``, optionally warning first.

    Args:
        input: Tensor to resize. When the alignment check runs, exactly two
            trailing dimensions (``input.shape[2:]``) are read as height and
            width.
        size: Target ``(height, width)``; a ``torch.Size`` is converted to a
            tuple of ints before being forwarded.
        scale_factor: Forwarded unchanged to ``F.interpolate``.
        mode: Forwarded unchanged to ``F.interpolate``.
        align_corners: Forwarded unchanged to ``F.interpolate``; the alignment
            check only runs when this is truthy.
        warning: When true, emit a warning if the requested upsampling sizes
            do not satisfy the ``x+1`` / ``nx+1`` relation.

    Returns:
        The result of ``F.interpolate(input, size, scale_factor, mode,
        align_corners)``.
    """
    if warning and size is not None and align_corners:
        input_h, input_w = tuple(int(x) for x in input.shape[2:])
        output_h, output_w = tuple(int(x) for x in size)
        # The check is only relevant when at least one axis is upsampled.
        # The width must be compared against the *input* width; comparing it
        # against the output height (as before) warned on some downsamples
        # and stayed silent on width-only upsamples.
        if output_h > input_h or output_w > input_w:
            all_sizes_above_one = (output_h > 1 and output_w > 1
                                   and input_h > 1 and input_w > 1)
            # Non-zero remainders mean (out - 1) is not a multiple of (in - 1).
            if (all_sizes_above_one
                    and (output_h - 1) % (input_h - 1)
                    and (output_w - 1) % (input_w - 1)):
                warnings.warn(
                    f'When align_corners={align_corners}, '
                    'the output would more aligned if '
                    f'input size {(input_h, input_w)} is `x+1` and '
                    f'out size {(output_h, output_w)} is `nx+1`')
    if isinstance(size, torch.Size):
        size = tuple(int(x) for x in size)
    return F.interpolate(input, size, scale_factor, mode, align_corners)
# File: megatron/training/activations.py

import torch
import torch.nn.functional as F

from megatron.core.jit import jit_fuser


@jit_fuser
def squared_relu(x: torch.Tensor) -> torch.Tensor:
    """Return ``relu(x)`` raised to the second power, element-wise."""
    rectified = F.relu(x)
    return torch.pow(rectified, 2)


@jit_fuser
def quick_gelu(x: torch.Tensor) -> torch.Tensor:
    """Return ``x * sigmoid(1.702 * x)``, element-wise."""
    gate = torch.sigmoid(1.702 * x)
    return x * gate


@jit_fuser
def fast_gelu(x: torch.Tensor) -> torch.Tensor:
    """Return the tanh-based GELU expression used by this module.

    NOTE(review): 0.7978845608 is numerically sqrt(2/pi); presumably this is
    the standard tanh approximation of GELU - confirm before relying on it.
    """
    inner = x * 0.7978845608 * (1.0 + 0.044715 * x * x)
    return 0.5 * x * (1.0 + torch.tanh(inner))


# File: megatron/training/log_handler.py

import sys
from logging import LogRecord, StreamHandler

BLACKLISTED_MODULES = ["torch.distributed"]


class CustomHandler(StreamHandler):
    """
    Stream handler that writes to stdout and drops records whose logger
    name starts with one of the entries in BLACKLISTED_MODULES.
    """

    def __init__(self):
        super().__init__(stream=sys.stdout)

    def filter(self, record: LogRecord) -> bool:
        """Return False for records from blacklisted loggers, True otherwise."""
        # A record is rejected as soon as its logger name matches any
        # blacklisted prefix (e.g., PyTorch Distributed).
        logger_name = record.name
        return not any(
            logger_name.startswith(prefix) for prefix in BLACKLISTED_MODULES
        )
-------------------------------------------------------------------------------- /requirements/pytorch:24.01/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | sentencepiece 9 | tiktoken 10 | wrapt 11 | zarr 12 | wandb 13 | triton==2.1.0 14 | tensorstore==0.1.45 15 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" -------------------------------------------------------------------------------- /requirements/pytorch:24.07/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | flask-restful 3 | nltk 4 | pytest 5 | pytest-cov 6 | pytest_mock 7 | pytest-random-order 8 | sentencepiece 9 | tiktoken 10 | wrapt 11 | zarr 12 | wandb 13 | tensorstore==0.1.45 14 | nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin" -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""Main tasks functionality."""

from megatron.training import get_args, print_rank_0
from megatron.legacy.indexer import IndexBuilder
from tasks.orqa.evaluate_utils import ORQAEvaluator


def main():
    """Build and save the index, then evaluate on the configured QA splits.

    A split is evaluated only when its path argument (``qa_data_dev`` /
    ``qa_data_test``) is not None.
    """
    args = get_args()

    # Create a BlockData data structure by running an IndexBuilder over an
    # ICT Dataset and then evaluate on NQ task
    print_rank_0("Starting index builder!")
    IndexBuilder().build_and_save_index()
    print_rank_0("Build and save indices: done!")

    print_rank_0("Starting evaluations!")

    # Set up the model and evaluator
    evaluator = ORQAEvaluator()

    # Run evaluation: DEV first, then TEST, reading each argument only when
    # its turn comes.
    for arg_name, split_label in (("qa_data_dev", "DEV"), ("qa_data_test", "TEST")):
        qa_path = getattr(args, arg_name)
        if qa_path is not None:
            evaluator.evaluate(qa_path, split_label)
# File: tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py

import os

# Kept ahead of the remaining imports, as in the original ordering.
# NOTE(review): presumably so that BLAS-backed libraries pick it up at import
# time - confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import json

import click

from tests.functional_tests.python_test_utils import common


# Reads Tensorboard summaries from ``logs_dir`` and, when ``output_path`` is
# given, writes every metric subsampled at a fixed step interval as JSON.
# Deliberately documented with comments instead of a docstring: click would
# surface a docstring in the command's ``--help`` output.
@click.command()
@click.option("--logs-dir", required=True, type=str, help="Path to Tensorboard logs")
@click.option("--output-path", required=False, type=str, help="Path to write golden values")
@click.option(
    "--is-convergence-test/--is-normal-test",
    type=bool,
    help="Tensorboard index to extract",
    default=False,
)
def collect_train_test_metrics(logs_dir: str, output_path: str, is_convergence_test: bool):
    # Convergence tests read index -1, normal tests index 0.
    tb_index = -1 if is_convergence_test else 0
    summaries = common.read_tb_logs_as_list(logs_dir, index=tb_index)

    step_interval = 5
    train_metrics = {}
    for metric_name, metric_values in summaries.items():
        num_steps = len(metric_values)
        train_metrics[metric_name] = {
            "start_step": 0,
            "end_step": num_steps,
            "step_interval": step_interval,
            # Keep every ``step_interval``-th value, starting from the first.
            "values": metric_values[0:num_steps:step_interval],
        }

    # Nothing is written unless an output path was supplied.
    if output_path is None:
        return

    with open(output_path, "w") as fh:
        json.dump(train_metrics, fh)


if __name__ == "__main__":
    collect_train_test_metrics()
"end_step": 50, 21 | "step_interval": 5, 22 | "values": [ 23 | 2254.0, 24 | 2585.0, 25 | 2101.0, 26 | 2157.0, 27 | 2241.0, 28 | 2475.0, 29 | 2890.0, 30 | 3199.0, 31 | 3524.0, 32 | 3090.0 33 | ] 34 | }, 35 | "iteration-time": { 36 | "start_step": 0, 37 | "end_step": 50, 38 | "step_interval": 5, 39 | "values": [ 40 | 13.65829, 41 | 1.27589, 42 | 1.2782, 43 | 1.32374, 44 | 1.26543, 45 | 1.26423, 46 | 1.26203, 47 | 1.54723, 48 | 1.27297, 49 | 1.26491 50 | ] 51 | } 52 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44396, 10.35607, 10.13786, 10.04016, 9.86838, 9.67302]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2291.0, 2485.0, 2953.0, 3287.0, 3440.0, 3059.0]}, "iteration_timing_avg": 0.8110379411764704} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": 
{ 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.49411, 8 | 10.4825, 9 | 10.49242, 10 | 10.47802, 11 | 10.46608, 12 | 10.35193, 13 | 10.17693, 14 | 10.07728, 15 | 9.88753, 16 | 9.68034 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1931.0, 25 | 2555.0, 26 | 2017.0, 27 | 2135.0, 28 | 2440.0, 29 | 2464.0, 30 | 3070.0, 31 | 3006.0, 32 | 2932.0, 33 | 2303.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 10.94975, 42 | 0.67196, 43 | 0.67378, 44 | 0.66862, 45 | 0.69618, 46 | 0.66936, 47 | 0.67757, 48 | 0.67189, 49 | 0.67519, 50 | 0.67762 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.46796, 8 | 10.45723, 9 | 10.44911, 10 | 10.44107, 11 | 10.41739, 12 | 10.34626, 13 | 10.11387, 14 | 10.0439, 15 | 9.86702, 16 | 9.679 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | 
"end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2404.0, 25 | 2610.0, 26 | 2173.0, 27 | 2312.0, 28 | 2371.0, 29 | 2652.0, 30 | 3089.0, 31 | 3200.0, 32 | 3497.0, 33 | 3075.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 15.80389, 42 | 0.94155, 43 | 0.88518, 44 | 1.22442, 45 | 0.86955, 46 | 0.85166, 47 | 1.02329, 48 | 1.07525, 49 | 0.90283, 50 | 0.88308 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.4681, 8 | 10.45734, 9 | 10.4491, 10 | 10.44121, 11 | 10.41764, 12 | 10.34626, 13 | 10.11384, 14 | 10.04383, 15 | 9.86686, 16 | 9.67906 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2373.0, 25 | 2593.0, 26 | 2187.0, 27 | 2325.0, 28 | 2407.0, 29 | 2627.0, 30 | 3036.0, 31 | 3109.0, 32 | 3568.0, 33 | 3019.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 22.86543, 42 | 0.84168, 43 | 0.92727, 44 | 0.84734, 45 | 0.93196, 46 | 0.86308, 47 | 0.86633, 48 | 0.86112, 49 | 0.87598, 50 | 1.02461 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.42085, 8 | 10.42901, 9 | 10.43576, 10 | 10.40804, 11 | 10.38463, 12 | 10.32426, 13 | 10.13148, 
14 | 10.04317, 15 | 9.86257, 16 | 9.65771 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 3252.0, 25 | 2595.0, 26 | 3240.0, 27 | 3429.0, 28 | 3463.0, 29 | 3509.0, 30 | 4065.0, 31 | 4114.0, 32 | 4651.0, 33 | 4253.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 10.83012, 42 | 2.26196, 43 | 2.22779, 44 | 2.22677, 45 | 2.23847, 46 | 2.24307, 47 | 2.23859, 48 | 2.23544, 49 | 2.2414, 50 | 2.25107 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.4209, 8 | 10.42905, 9 | 10.43557, 10 | 10.40806, 11 | 10.38457, 12 | 10.32414, 13 | 10.13167, 14 | 10.04335, 15 | 9.86262, 16 | 9.65771 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2249.0, 25 | 3640.0, 26 | 3249.0, 27 | 2318.0, 28 | 3512.0, 29 | 3601.0, 30 | 4111.0, 31 | 3175.0, 32 | 4713.0, 33 | 3320.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 12.51144, 42 | 2.1285, 43 | 2.28886, 44 | 2.24273, 45 | 2.20818, 46 | 2.20231, 47 | 2.18786, 48 | 2.17554, 49 | 2.213, 50 | 2.18811 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 
10.49101, 8 | 10.49526, 9 | 10.48682, 10 | 10.48817, 11 | 10.49415, 12 | 10.4724, 13 | 10.42265, 14 | 10.29901, 15 | 10.1572, 16 | 9.97594 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 12.56945, 25 | 0.58599, 26 | 0.58451, 27 | 0.68178, 28 | 0.6056, 29 | 0.609, 30 | 0.59965, 31 | 0.60618, 32 | 0.60152, 33 | 0.59945 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 34, 39 | "step_interval": 5, 40 | "values": [ 41 | 17032.0, 42 | 16918.0, 43 | 19957.0, 44 | 18761.0, 45 | 25689.0, 46 | 19897.0, 47 | 22224.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.50096, 8 | 10.48594, 9 | 10.4936, 10 | 10.48501, 11 | 10.50417, 12 | 10.4773, 13 | 10.42154, 14 | 10.29716, 15 | 10.15831, 16 | 9.96751 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 12.85743, 25 | 0.58922, 26 | 0.54928, 27 | 0.54147, 28 | 0.56305, 29 | 0.56895, 30 | 0.56282, 31 | 0.56247, 32 | 0.56751, 33 | 0.69574 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 34, 39 | "step_interval": 5, 40 | "values": [ 41 | 16595.0, 42 | 18537.0, 43 | 19509.0, 44 | 18532.0, 45 | 26712.0, 46 | 20164.0, 47 | 20981.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 
6 | "values": [ 7 | 10.49734, 8 | 10.49243, 9 | 10.49325, 10 | 10.50311, 11 | 10.48985, 12 | 10.4721, 13 | 10.41217, 14 | 10.2805, 15 | 10.14052, 16 | 9.94191 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 8.58282, 25 | 2.06311, 26 | 2.05789, 27 | 2.24493, 28 | 2.05273, 29 | 2.05118, 30 | 2.05666, 31 | 2.04533, 32 | 2.05152, 33 | 2.04761 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 34, 39 | "step_interval": 5, 40 | "values": [ 41 | 26081.0, 42 | 18799.0, 43 | 24479.0, 44 | 23782.0, 45 | 21056.0, 46 | 19877.0, 47 | 19774.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.48685, 8 | 10.49276, 9 | 10.48837, 10 | 10.51348, 11 | 10.49396, 12 | 10.4755, 13 | 10.41921, 14 | 10.28044, 15 | 10.14256, 16 | 9.94738 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 10.8221, 25 | 1.96114, 26 | 1.9401, 27 | 2.22227, 28 | 1.94508, 29 | 1.94212, 30 | 1.93958, 31 | 1.94562, 32 | 1.9442, 33 | 1.94606 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 34, 39 | "step_interval": 5, 40 | "values": [ 41 | 26876.0, 42 | 19339.0, 43 | 24146.0, 44 | 23625.0, 45 | 21440.0, 46 | 17865.0, 47 | 19282.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 4 
| NCCL_ALGO: Tree 5 | CUBLAS_WORKSPACE_CONFIG: :4096:8 6 | MODEL_ARGS: 7 | TEST_TYPE: regular 8 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.max_epochs: 'null' 11 | trainer.precision: bf16 12 | model.num_layers: 12 13 | model.hidden_size: 768 14 | model.num_attention_heads: 12 15 | model.micro_batch_size: 1 16 | model.global_batch_size: 8 17 | model.tensor_model_parallel_size: 2 18 | model.pipeline_model_parallel_size: 4 19 | model.virtual_pipeline_model_parallel_size: 3 20 | model.encoder_seq_length: 2048 21 | model.max_position_embeddings: 2048 22 | model.ffn_hidden_size: 3072 23 | model.mcore_gpt: 'True' 24 | model.apply_query_key_layer_scaling: 'True' 25 | model.megatron_amp_O2: 'True' 26 | model.data.data_prefix: '[]' 27 | model.data.data_impl: mock 28 | model.data.splits_string: '[99990,8,2]' 29 | model.optim.name: distributed_fused_adam 30 | model.optim.weight_decay: 0.1 31 | exp_manager.create_checkpoint_callback: 'False' 32 | model.sequence_parallel: 'True' 33 | model.overlap_p2p_comm: 'True' 34 | model.batch_p2p_comm: 'False' 35 | TEST_TYPE: regular 36 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml: -------------------------------------------------------------------------------- 1 | ENV_VARS: 2 | CUDA_DEVICE_MAX_CONNECTIONS: 1 3 | SKIP_PYTEST: 1 4 | MODEL_ARGS: 5 | trainer.num_nodes: 1 6 | 
trainer.devices: 8 7 | trainer.max_steps: 50 8 | trainer.val_check_interval: 50 9 | trainer.limit_val_batches: 50 10 | trainer.max_epochs: 'null' 11 | trainer.precision: bf16 12 | model.num_layers: 12 13 | model.hidden_size: 768 14 | model.num_attention_heads: 12 15 | model.micro_batch_size: 4 16 | model.global_batch_size: 64 17 | model.tensor_model_parallel_size: 1 18 | model.pipeline_model_parallel_size: 1 19 | model.virtual_pipeline_model_parallel_size: 'null' 20 | model.encoder_seq_length: 2048 21 | model.max_position_embeddings: 2048 22 | model.ffn_hidden_size: 3072 23 | model.mcore_gpt: 'True' 24 | model.apply_query_key_layer_scaling: 'True' 25 | model.megatron_amp_O2: 'True' 26 | model.data.data_prefix: '[]' 27 | model.data.data_impl: mock 28 | model.data.splits_string: '[99990,8,2]' 29 | model.optim.name: distributed_fused_adam 30 | model.optim.weight_decay: 0.1 31 | exp_manager.create_checkpoint_callback: 'False' 32 | TEST_TYPE: regular 33 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.83373, 8 | 10.86683, 9 | 10.89023, 10 | 10.81051, 11 | 10.68459, 12 | 10.60979, 13 | 10.08992, 14 | 10.21481, 15 | 10.14018, 16 | 9.80603 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1488.0, 25 | 1854.0, 26 | 1854.0, 27 | 1884.0, 28 | 1794.0, 29 | 1784.0, 30 | 1569.0, 31 | 1942.0, 32 | 2263.0, 33 | 2147.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 13.39475, 42 | 0.14158, 43 | 0.14256, 44 | 0.14166, 45 | 0.14243, 46 | 0.14232, 47 | 0.143, 48 | 0.14113, 49 | 0.14164, 50 | 0.14069 51 
| ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.83373, 8 | 10.86683, 9 | 10.89023, 10 | 10.81051, 11 | 10.68459, 12 | 10.60979, 13 | 10.08992, 14 | 10.21481, 15 | 10.14018, 16 | 9.80603 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1488.0, 25 | 1854.0, 26 | 1854.0, 27 | 1884.0, 28 | 1794.0, 29 | 1784.0, 30 | 1569.0, 31 | 1942.0, 32 | 2263.0, 33 | 2147.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 13.39475, 42 | 0.14158, 43 | 0.14256, 44 | 0.14166, 45 | 0.14243, 46 | 0.14232, 47 | 0.143, 48 | 0.14113, 49 | 0.14164, 50 | 0.14069 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm 
loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 
3289.0]}, "iteration_timing_avg": 0.12010238805970147} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.79206, 8 | 10.86691, 9 | 10.89065, 10 | 10.78186, 11 | 10.65978, 12 | 10.58022, 13 | 10.08207, 14 | 10.19156, 15 | 10.13495, 16 | 9.81167 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1626.0, 25 | 1866.0, 26 | 1959.0, 27 | 1816.0, 28 | 1890.0, 29 | 1654.0, 30 | 1537.0, 31 | 1965.0, 32 | 2436.0, 33 | 2405.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 21.9348, 42 | 0.1633, 43 | 0.16334, 44 | 0.16269, 45 | 0.16133, 46 | 0.16064, 47 | 0.16007, 48 | 0.15926, 49 | 0.1592, 50 | 0.15982 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.79206, 8 | 10.86691, 9 | 10.89065, 10 | 10.78186, 11 | 10.65978, 12 | 10.58022, 13 | 10.08207, 14 | 10.19156, 15 | 10.13495, 16 | 9.81167 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1626.0, 25 | 1866.0, 26 | 1959.0, 27 | 1816.0, 28 | 1890.0, 29 | 1654.0, 30 | 1537.0, 31 | 1965.0, 32 | 2436.0, 33 | 2405.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": 
[ 41 | 21.9348, 42 | 0.1633, 43 | 0.16334, 44 | 0.16269, 45 | 0.16133, 46 | 0.16064, 47 | 0.16007, 48 | 0.15926, 49 | 0.1592, 50 | 0.15982 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.86122, 8 | 10.88647, 9 | 10.87773, 10 | 10.83111, 11 | 10.7165, 12 | 10.60619, 13 | 10.13147, 14 | 10.22767, 15 | 10.15929, 16 | 9.83482 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1694.0, 25 | 2148.0, 26 | 2169.0, 27 | 2103.0, 28 | 1991.0, 29 | 1900.0, 30 | 1707.0, 31 | 2189.0, 32 | 2557.0, 33 | 2606.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 9.61991, 42 | 0.29135, 43 | 0.28852, 44 | 0.28971, 45 | 0.29221, 46 | 0.28994, 47 | 0.28976, 48 | 0.28887, 49 | 0.28975, 50 | 0.2869 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.86122, 8 | 10.88647, 9 | 10.87773, 10 | 10.83111, 11 | 10.7165, 12 | 10.60623, 13 | 10.13146, 14 | 10.2277, 15 | 10.15933, 16 | 9.8348 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1694.0, 25 | 2148.0, 26 | 2169.0, 27 | 2103.0, 28 | 1991.0, 29 | 1869.0, 30 | 1760.0, 31 | 2214.0, 32 | 2529.0, 33 | 2587.0 34 | ] 35 | }, 36 | 
"iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 11.72537, 42 | 0.29824, 43 | 0.29549, 44 | 0.29574, 45 | 0.29514, 46 | 0.29533, 47 | 0.29415, 48 | 0.30722, 49 | 0.29731, 50 | 0.29867 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.87346, 8 | 10.89625, 9 | 10.88939, 10 | 10.88681, 11 | 10.8893, 12 | 10.84863, 13 | 10.6962, 14 | 10.63919, 15 | 10.53931, 16 | 10.31119 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 4.95266, 25 | 0.07818, 26 | 0.07961, 27 | 0.07716, 28 | 0.08368, 29 | 0.08327, 30 | 0.08409, 31 | 0.08371, 32 | 0.08372, 33 | 0.08387 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 32, 39 | "step_interval": 5, 40 | "values": [ 41 | 1300.0, 42 | 1287.0, 43 | 1565.0, 44 | 1441.0, 45 | 1419.0, 46 | 1295.0, 47 | 1177.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.87346, 8 | 10.89625, 9 | 10.88939, 10 | 10.88681, 11 | 10.88931, 12 | 10.84864, 13 | 10.6962, 14 | 10.63918, 15 | 10.5393, 16 | 10.31119 17 | ] 18 | }, 19 | "iteration-time": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 5.32064, 25 | 0.08204, 26 | 0.08233, 27 | 0.08176, 28 | 0.09748, 29 | 0.0966, 30 | 0.09648, 31 | 0.09617, 32 | 0.09604, 33 | 0.09646 34 | ] 35 | }, 36 | "num-zeros": { 37 | "start_step": 0, 38 | "end_step": 32, 39 | "step_interval": 5, 40 | "values": [ 41 | 1112.0, 42 | 1124.0, 43 | 1229.0, 44 | 1665.0, 45 | 1269.0, 46 | 1219.0, 47 | 1572.0 48 | ] 49 | } 50 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, 
"step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, 
"iteration_timing_avg": 0.1320626865671642} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": 
{"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 
9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 
2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, 
"step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.8401, 8 | 10.87259, 9 | 10.85024, 10 | 10.79646, 11 | 10.68156, 12 | 10.60618, 13 | 10.12768, 14 | 10.22185, 15 | 10.13788, 16 | 9.82309 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1698.0, 25 | 1855.0, 26 | 1949.0, 27 | 1968.0, 28 | 1881.0, 29 | 1783.0, 30 | 1653.0, 31 | 2037.0, 32 | 2313.0, 33 | 2300.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 5.37706, 42 | 0.09618, 43 | 0.09432, 44 | 
0.09666, 45 | 0.09442, 46 | 0.09619, 47 | 0.09453, 48 | 0.0975, 49 | 0.09517, 50 | 0.09727 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.8401, 8 | 10.87259, 9 | 10.85023, 10 | 10.79646, 11 | 10.68153, 12 | 10.60619, 13 | 10.12767, 14 | 10.22185, 15 | 10.13787, 16 | 9.82307 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1698.0, 25 | 1855.0, 26 | 1896.0, 27 | 1866.0, 28 | 2032.0, 29 | 1814.0, 30 | 1664.0, 31 | 1961.0, 32 | 2306.0, 33 | 2403.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 8.00253, 42 | 0.13176, 43 | 0.13026, 44 | 0.13184, 45 | 0.13023, 46 | 0.13135, 47 | 0.13014, 48 | 0.13143, 49 | 0.1305, 50 | 0.13191 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82974, 10.85934, 10.88536, 10.78981, 10.64534, 10.56415, 9.99534, 10.13972, 10.06259, 9.71481]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [261.0, 256.0, 258.0, 250.0, 243.0, 265.0, 254.0, 299.0, 299.0, 294.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85803, 10.88122, 10.85832, 10.80987, 10.66115, 10.55375, 10.01843, 10.14234, 10.05958, 9.71149]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [244.0, 231.0, 243.0, 257.0, 247.0, 267.0, 256.0, 299.0, 318.0, 325.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.8468, 8 | 10.87769, 9 | 10.90302, 10 | 10.82026, 11 | 10.67979, 12 | 10.60157, 13 | 10.06449, 14 | 10.19316, 15 | 10.11411, 16 | 9.76007 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1692.0, 25 | 2044.0, 26 | 2005.0, 27 | 2007.0, 28 | 1945.0, 29 | 1868.0, 30 | 1701.0, 31 | 2085.0, 32 | 2389.0, 33 | 2377.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 10.20538, 42 | 0.14353, 43 | 0.14213, 44 | 0.14213, 45 | 0.14068, 46 | 0.14104, 47 | 0.14078, 48 | 0.14149, 49 | 0.14065, 50 | 0.14118 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.84474, 8 | 10.87688, 9 | 10.90253, 10 | 10.81872, 11 | 10.67849, 12 | 10.60076, 13 | 10.06361, 14 | 10.19267, 15 | 10.11344, 16 | 9.75987 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1769.0, 25 | 2129.0, 26 | 1987.0, 27 | 1961.0, 28 | 1961.0, 29 | 1886.0, 30 | 1655.0, 31 | 2130.0, 32 | 2315.0, 33 | 2362.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 8.72642, 42 | 0.16194, 43 | 0.15926, 44 | 0.15956, 45 | 0.15972, 46 | 0.1623, 47 | 0.16029, 48 | 0.15863, 49 | 0.15947, 50 | 0.15935 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.79205, 8 | 10.86789, 9 | 10.89149, 10 | 10.78328, 11 
| 10.66126, 12 | 10.58275, 13 | 10.08467, 14 | 10.19448, 15 | 10.13785, 16 | 9.81454 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1580.0, 25 | 1778.0, 26 | 1849.0, 27 | 1841.0, 28 | 1884.0, 29 | 1679.0, 30 | 1544.0, 31 | 1953.0, 32 | 2449.0, 33 | 2335.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 10.79458, 42 | 0.16744, 43 | 0.16286, 44 | 0.16276, 45 | 0.16292, 46 | 0.16346, 47 | 0.16288, 48 | 0.16273, 49 | 0.16282, 50 | 0.16245 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.79208, 8 | 10.86688, 9 | 10.89063, 10 | 10.7818, 11 | 10.65964, 12 | 10.58005, 13 | 10.0819, 14 | 10.19136, 15 | 10.13478, 16 | 9.81149 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1602.0, 25 | 1792.0, 26 | 1751.0, 27 | 1885.0, 28 | 
1872.0, 29 | 1716.0, 30 | 1561.0, 31 | 1867.0, 32 | 2355.0, 33 | 2329.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 13.82777, 42 | 0.17397, 43 | 0.17253, 44 | 0.17285, 45 | 0.17221, 46 | 0.17204, 47 | 0.17139, 48 | 0.17105, 49 | 0.17258, 50 | 0.17185 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.74049, 8 | 10.81937, 9 | 10.84178, 10 | 10.75558, 11 | 10.69821, 12 | 10.63096, 13 | 10.2026, 14 | 10.36288, 15 | 10.25634, 16 | 9.94255 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2529.0, 25 | 2845.0, 26 | 2909.0, 27 | 2683.0, 28 | 2631.0, 29 | 2573.0, 30 | 2281.0, 31 | 2559.0, 32 | 2484.0, 33 | 2360.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 14.80986, 42 | 0.17896, 43 | 0.17664, 44 | 0.17758, 45 | 0.17762, 46 | 
0.17676, 47 | 0.17638, 48 | 0.1761, 49 | 0.17725, 50 | 0.1755 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.82005, 8 | 10.87447, 9 | 10.87793, 10 | 10.79509, 11 | 10.68164, 12 | 10.59514, 13 | 10.10045, 14 | 10.21239, 15 | 10.13862, 16 | 
9.80879 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1562.0, 25 | 1754.0, 26 | 1879.0, 27 | 1778.0, 28 | 1877.0, 29 | 1733.0, 30 | 1578.0, 31 | 1924.0, 32 | 2299.0, 33 | 2292.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 18.71949, 42 | 0.16575, 43 | 0.16508, 44 | 0.16465, 45 | 0.16475, 46 | 0.16222, 47 | 0.16473, 48 | 0.16461, 49 | 0.16489, 50 | 0.16518 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.82005, 8 | 10.87448, 9 | 10.87796, 10 | 10.79506, 11 | 10.68153, 12 | 10.59413, 13 | 10.09983, 14 | 10.20957, 15 | 10.13642, 16 | 9.80012 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1562.0, 25 | 1687.0, 26 | 1848.0, 27 | 1736.0, 28 | 1955.0, 29 | 1764.0, 30 | 1580.0, 31 | 1886.0, 32 | 2252.0, 33 | 
2259.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 16.16694, 42 | 0.16354, 43 | 0.16237, 44 | 0.16232, 45 | 0.16088, 46 | 0.15891, 47 | 0.15894, 48 | 0.15865, 49 | 0.16009, 50 | 0.1576 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.82005, 8 | 10.87447, 9 | 10.87793, 10 | 10.79509, 11 | 10.68164, 12 | 10.59514, 13 | 10.10045, 14 | 10.21239, 15 | 10.13862, 16 | 9.80879 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1562.0, 25 | 1754.0, 26 | 1879.0, 27 | 1778.0, 28 | 1877.0, 29 | 1733.0, 30 | 1578.0, 31 | 1924.0, 32 | 2299.0, 33 | 2292.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 18.68941, 42 | 0.16498, 43 | 0.16403, 44 | 0.16281, 45 | 0.16302, 46 | 0.16352, 47 | 0.16473, 48 | 0.16207, 49 | 0.16362, 50 | 0.16219 51 | ] 
52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.82005, 8 | 10.87447, 9 | 10.87799, 10 | 10.79507, 11 | 10.68165, 12 | 10.59511, 13 | 10.10047, 14 | 10.2124, 15 | 10.13861, 16 | 9.80876 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1562.0, 25 | 1738.0, 26 | 1852.0, 27 | 1802.0, 28 | 1917.0, 29 | 1765.0, 30 | 1570.0, 31 | 1949.0, 32 | 2251.0, 33 | 2270.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 14.96968, 42 | 0.16347, 43 | 0.16403, 44 | 0.16317, 45 | 0.162, 46 | 0.16129, 47 | 0.16268, 48 | 0.16156, 49 | 0.16212, 50 | 0.16407 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87799, 10.79508, 10.68166, 10.59514, 10.10042, 10.21238, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1857.0, 1746.0, 1883.0, 1738.0, 1475.0, 1851.0, 2303.0, 2258.0]}, "iteration_timing_avg": 0.12873676470588236} -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.8763, 10.79906, 10.68214, 10.59702, 10.49258, 10.11236, 10.12393, 9.98165]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1118.0, 1331.0, 1230.0, 1085.0, 1180.0, 1245.0, 1454.0, 1330.0, 1752.0, 1851.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [17.24286, 0.35341, 0.35187, 0.35028, 0.34941, 0.35093, 0.3488, 0.35179, 0.34905, 0.34684]}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.87624, 10.79904, 10.68212, 10.59698, 10.49257, 10.11232, 10.12396, 9.98163]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": 
[1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22.22011, 0.36082, 0.35927, 0.35627, 0.35901, 0.35008, 0.34828, 0.34774, 0.35145, 0.35141]}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9735, 10.96043, 10.95576, 10.91038, 10.78791, 10.71201, 10.22424, 10.28926, 10.19049, 9.86378]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727052.0, 23021930.0, 22501022.0, 22831208.0, 22740024.0, 22547916.0, 22955210.0, 22589344.0, 22658940.0, 22884970.0]},"iteration_timing_avg": 0.1367805882352941} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} 2 | -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.88734, 8 | 10.91614, 9 | 10.89061, 10 | 10.86173, 11 | 10.72753, 12 | 10.64491, 13 | 10.18012, 14 | 10.2562, 15 | 10.1611, 16 | 9.8539 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 3268.0, 25 | 4040.0, 26 | 4142.0, 27 | 3766.0, 28 | 4028.0, 29 | 3648.0, 30 | 3306.0, 31 | 4028.0, 32 | 4648.0, 33 | 4546.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 7.0561, 42 | 0.32588, 43 | 0.32628, 44 | 0.32385, 45 | 0.32419, 46 | 0.32364, 47 | 0.32337, 48 | 0.32334, 49 | 0.32358, 50 | 0.32395 51 | ] 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [10.88734, 10.91612, 10.8906, 10.86171, 10.72752, 10.64491, 10.18015, 10.25622, 10.16111, 9.85394]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3228.0, 3820.0, 3890.0, 3848.0, 3902.0, 3486.0, 3310.0, 3982.0, 4472.0, 4532.0]}, "iteration_timing_avg": 0.22043823529411763} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.79987, 8 | 10.85983, 9 | 10.865, 10 | 10.799, 11 | 10.70987, 12 | 10.63782, 13 | 10.1965, 14 | 10.3099, 15 | 10.22262, 16 | 9.91423 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 30784.0, 25 | 37528.0, 26 | 37616.0, 27 | 36105.0, 28 | 33464.0, 29 | 34923.0, 30 | 30806.0, 31 | 35663.0, 32 | 36661.0, 33 | 37641.0 34 | ] 35 | }, 36 | "iteration_timing_avg": 0.3566726470588235 37 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.8029, 8 | 10.86149, 9 | 10.86819, 10 | 10.80829, 11 | 10.72062, 12 | 10.64588, 13 | 10.21132, 14 | 10.32324, 15 | 10.2265, 16 | 9.92918 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 31473.0, 25 | 37753.0, 26 | 38332.0, 27 | 36348.0, 28 | 33270.0, 29 | 
34310.0, 30 | 30284.0, 31 | 35432.0, 32 | 36356.0, 33 | 37109.0 34 | ] 35 | }, 36 | "iteration_timing_avg": 0.21900323529411767 37 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.83445, 8 | 10.87978, 9 | 10.87924, 10 | 10.81567, 11 | 10.69374, 12 | 10.60333, 13 | 10.08824, 14 | 10.21471, 15 | 10.10778, 16 | 9.78309 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 26648.0, 25 | 32884.0, 26 | 33611.0, 27 | 31683.0, 28 | 28744.0, 29 | 30671.0, 30 | 28602.0, 31 | 33538.0, 32 | 34560.0, 33 | 35099.0 34 | ] 35 | }, 36 | "iteration_timing_avg": 0.28211852941176474 37 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.81823, 8 | 10.86998, 9 | 10.8727, 10 | 10.80014, 11 | 10.67571, 12 | 10.57944, 13 | 10.06572, 14 | 10.19342, 15 | 10.08575, 16 | 9.75236 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 26801.0, 25 | 32734.0, 26 | 32925.0, 27 | 31593.0, 28 | 28610.0, 29 | 30362.0, 30 | 28464.0, 31 | 33486.0, 32 | 33403.0, 33 | 35162.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 8.63293, 42 | 0.29454, 43 | 0.28102, 44 | 0.28297, 45 | 0.28369, 46 
| 0.2848, 47 | 0.30008, 48 | 0.29214, 49 | 0.31041, 50 | 0.295 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.81823, 8 | 10.86998, 9 | 10.8727, 10 | 10.80014, 11 | 10.67571, 12 | 10.57944, 13 | 10.06572, 14 | 10.19342, 15 | 10.08575, 16 | 9.75236 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 26801.0, 25 | 32734.0, 26 | 32925.0, 27 | 31593.0, 28 | 28610.0, 29 | 30362.0, 30 | 28464.0, 31 | 33486.0, 32 | 33403.0, 33 | 35162.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 11.94141, 42 | 0.28425, 43 | 0.28413, 44 | 0.29449, 45 | 0.28534, 46 | 0.29977, 47 | 0.30061, 48 | 0.30321, 49 | 0.30986, 50 | 0.30404 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} 2 | 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.92705, 8 | 10.93624, 9 | 10.89333, 10 | 10.87317, 11 | 10.74871, 12 | 10.65379, 13 | 10.15753, 14 | 10.24638, 15 | 10.15178, 16 | 9.83806 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1653.0, 25 | 1874.0, 26 | 1994.0, 27 | 1828.0, 28 | 1769.0, 29 | 1845.0, 30 | 1674.0, 31 | 1957.0, 32 | 2364.0, 33 | 2345.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 11.33146, 42 | 0.22344, 43 | 0.21997, 44 | 0.21977, 45 | 0.21792, 46 | 0.21685, 47 | 0.22555, 48 | 0.21755, 49 | 0.21796, 50 | 0.21694 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.92705, 8 | 10.93628, 9 | 10.89334, 10 | 10.87322, 11 | 10.74869, 12 | 10.65374, 13 | 10.15755, 14 | 10.24638, 15 | 10.15177, 16 | 9.83799 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 68.0, 25 | 64.0, 26 | 61.0, 27 | 70.0, 28 | 66.0, 29 | 55.0, 30 | 76.0, 31 | 72.0, 32 | 64.0, 33 | 85.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 9.68102, 42 | 0.22487, 43 | 0.22503, 44 | 0.22418, 45 | 0.22445, 46 | 0.22504, 47 | 0.22333, 48 | 0.22333, 49 | 0.22458, 50 | 0.22367 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.92705, 8 | 10.93624, 9 | 10.89333, 10 | 10.87317, 11 | 10.74871, 12 | 10.65379, 13 | 10.15753, 14 | 10.24638, 15 | 10.15178, 16 | 9.83806 17 | ] 18 | }, 19 | 
"num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1653.0, 25 | 1874.0, 26 | 1994.0, 27 | 1828.0, 28 | 1769.0, 29 | 1845.0, 30 | 1674.0, 31 | 1957.0, 32 | 2364.0, 33 | 2345.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 11.05896, 42 | 0.21941, 43 | 0.22052, 44 | 0.22086, 45 | 0.22118, 46 | 0.22063, 47 | 0.22075, 48 | 0.22064, 49 | 0.22956, 50 | 0.23548 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- 
/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.92705, 8 | 10.93624, 9 | 10.89333, 10 | 10.87317, 11 | 10.74871, 12 | 10.65379, 13 | 10.15753, 14 | 10.24638, 15 | 10.15178, 16 | 9.83806 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1653.0, 25 | 1874.0, 26 | 1994.0, 27 | 1828.0, 28 | 1769.0, 29 | 1845.0, 30 | 1674.0, 31 | 1957.0, 32 | 2364.0, 33 | 2345.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 9.47055, 42 | 0.34439, 43 | 0.22313, 44 | 0.22277, 45 | 0.22175, 46 | 0.21936, 47 | 0.23348, 48 | 0.22009, 49 | 0.22043, 50 | 0.21934 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 
50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.86217, 8 | 10.88646, 9 | 10.87861, 10 | 10.83295, 11 | 10.7203, 12 | 10.61089, 13 | 10.14181, 14 | 10.23434, 15 | 10.16609, 16 | 9.84444 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 1769.0, 25 | 2056.0, 26 | 2198.0, 27 | 2079.0, 28 | 2181.0, 29 | 1912.0, 30 | 1825.0, 31 | 2115.0, 32 | 2621.0, 33 | 2598.0 34 | ] 35 | }, 36 | "iteration-time": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 6.42448, 42 | 0.42854, 43 | 0.42836, 44 | 
0.42582, 45 | 0.42274, 46 | 0.42187, 47 | 0.42561, 48 | 0.42178, 49 | 0.44234, 50 | 0.42304 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 
2987.0]}, "iteration_timing_avg": 0.1211408823529412} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} 2 | 
-------------------------------------------------------------------------------- /tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, 
"iteration_timing_avg": 0.5847132352941178} -------------------------------------------------------------------------------- /tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.41501, 9.20443, 8.62112, 8.34419, 8.08454, 7.96905, 7.68086, 7.39418, 7.26109, 7.19122]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [115751.0, 111072.0, 117055.0, 112398.0, 118712.0, 116944.0, 111387.0, 114025.0, 118464.0, 116959.0]}, "iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/test_utils/recipes/_build-mcore-dev.yaml: -------------------------------------------------------------------------------- 1 | type: build 2 | format_version: 1 3 | maintainers: [maanug] 4 | spec: 5 | name: mcore-pyt-dev 6 | platforms: [linux/amd64] 7 | source: 8 | # The image tag will be added via `jet-tests.yaml` 9 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 10 | image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev 11 | -------------------------------------------------------------------------------- /tests/test_utils/recipes/_build-mcore-lts.yaml: -------------------------------------------------------------------------------- 1 | type: build 2 | format_version: 1 3 | maintainers: [maanug] 4 | spec: 5 | name: mcore-pyt-lts 6 | platforms: [linux/amd64] 7 | source: 8 | # The image tag will be added via `jet-tests.yaml` 9 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 10 | image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_lts 11 | -------------------------------------------------------------------------------- /tests/test_utils/recipes/_build-nemo.yaml: -------------------------------------------------------------------------------- 1 | type: build 2 | 
format_version: 1 3 | maintainers: [maanug] 4 | spec: 5 | name: mcore-nemo 6 | platforms: [linux/amd64] 7 | source: 8 | # The image tag will be added via `jet-tests.yaml` 9 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 10 | image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci -------------------------------------------------------------------------------- /tests/test_utils/recipes/gpt-nemo.yaml: -------------------------------------------------------------------------------- 1 | type: basic 2 | format_version: 1 3 | maintainers: [mcore] 4 | loggers: [stdout] 5 | spec: 6 | name: "{test_case}" 7 | model: gpt-nemo 8 | build: mcore-nemo 9 | nodes: 1 10 | gpus: 8 11 | platforms: dgx_a100 12 | time_limit: 1800 13 | scope: null 14 | script: |- 15 | ls 16 | cd /opt/NeMo 17 | 18 | ARGUMENTS=( 19 | "DATA_PATH='-'" 20 | "DATA_CACHE_PATH='-'" 21 | "OUTPUT_PATH={assets_dir}" 22 | "TENSORBOARD_PATH={assets_dir}/tensorboard" 23 | "CHECKPOINT_PATH=/workspace/checkpoints" 24 | "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" 25 | "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" 26 | "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" 27 | "N_REPEAT={n_repeat}" 28 | ) 29 | 30 | bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} 31 | 32 | products: 33 | - environment: [dev] 34 | scope: [mr] 35 | n_repeat: [5] 36 | test_case: 37 | - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G 38 | - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G 39 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import torch._dynamo 2 | 3 | torch._dynamo.config.suppress_errors = True 4 | 
from unittest import mock

import pytest

from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy


def pytest_sessionfinish(session, exitstatus):
    """Rewrite a session exit status of 5 to 0.

    NOTE(review): 5 is presumably pytest's "no tests collected" code, so an
    empty selection is reported as success -- confirm against the pytest docs.
    """
    if exitstatus != 5:
        return
    session.exitstatus = 0


@pytest.fixture(scope='session', autouse=True)
def set_default_dist_ckpt_strategy():
    """Patch the default sharded-save strategy for the whole test session.

    While the fixture is active,
    ``serialization.get_default_save_sharded_strategy`` is replaced by a
    function that always asks for the ``'torch_dist'`` backend, version 1.
    The replacement function is what the fixture yields.
    """

    def get_pyt_dist_save_sharded_strategy():
        return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1)

    patcher = mock.patch(
        'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy',
        new=get_pyt_dist_save_sharded_strategy,
    )
    # start() returns the object installed by the patch (the function above).
    replacement = patcher.start()
    try:
        yield replacement
    finally:
        # Always restore the original attribute, even if the session errors out.
        patcher.stop()
import torch

from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)


class TestModelInferenceWrapperConfig:
    """Checks that extra attributes can be attached to an InferenceWrapperConfig."""

    def test_inference_params(self):
        """``add_attributes`` must expose each dict entry as an attribute."""
        inference_parameters = InferenceWrapperConfig(
            hidden_size=10,
            inference_batch_times_seqlen_threshold=10,
            padded_vocab_size=10,
            params_dtype=torch.float,
            fp32_residual_connection=False,
        )
        inference_parameters.add_attributes({"abc": 45})
        # The message must format the attribute under test: this test never
        # sets ``min_tokens``, so formatting it would raise AttributeError and
        # hide the real assertion failure.
        assert (
            inference_parameters.abc == 45
        ), f"abc not set correctly. it is {inference_parameters.abc}"
import numpy as np
import torch

from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from tests.unit_tests.test_utilities import Utils


def test_vocab_parallel_cross_entropy():
    """Compare vocab-parallel cross entropy against precomputed losses.

    The logits are the row ``[0, 1, ..., 7]`` tiled to shape ``(16, 32)`` and
    the targets are the even ids ``0, 2, ..., 30``. Results are compared after
    rounding to the nearest integer, as in the original test.
    """
    Utils.initialize_model_parallel(4, 2)
    try:
        # torch.range is deprecated; arange(0, 8) as float32 yields the same
        # eight values that range(0, 7) (end-inclusive) produced.
        vocab_parallel_logits = torch.arange(0, 8, dtype=torch.float32).repeat(16, 4).cuda()
        target = torch.arange(0, 32, 2).cuda()
        output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
        # The expected losses repeat with period four across the 16 rows.
        expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309] * 4).cuda()
        assert torch.equal(torch.round(expected_output), torch.round(output))
    finally:
        # Tear down parallel state even when the assertion fails, so a failure
        # here does not leak initialized groups into later tests.
        Utils.destroy_model_parallel()
import torch

from megatron.core.tensor_parallel.data import broadcast_data
from tests.unit_tests.test_utilities import Utils


def test_broadcast_data():
    """Broadcast two entries of a dict of constant tensors and check them.

    Entry ``k`` of the input is an 8x8 CUDA tensor filled with ``float(k)``;
    only keys 0 and 1 are requested from ``broadcast_data``.
    """
    Utils.initialize_model_parallel(2, 4)
    input_data = {key: torch.ones((8, 8)).cuda() * float(key) for key in range(8)}
    requested_keys = [0, 1]
    actual_output = broadcast_data(requested_keys, input_data, torch.float32)
    for key in requested_keys:
        assert torch.equal(actual_output[key], input_data[key])
    Utils.destroy_model_parallel()
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import psutil


def print_memory_usage(key, rank, num_ranks):
    '''Print the resident memory of this process next to total system memory.

    Args:
        key: Label identifying the call site; printed verbatim.
        rank: Rank of the calling process (formatted with ``%d``).
        num_ranks: Total number of ranks (formatted with ``%d``).
    '''
    bytes_per_gb = 1024**3
    rss_bytes = psutil.Process().memory_info().rss
    # Read the total directly instead of reconstructing it as
    # 100 * rss / memory_percent(), which is a lossy float round trip.
    total_bytes = psutil.virtual_memory().total
    print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb." % (
        key,
        rank,
        num_ranks,
        rss_bytes / bytes_per_gb,
        total_bytes / bytes_per_gb,
    ))
# Print the path of every *.py file under ./megatron/ whose leading line
# carries none of the accepted copyright/license notices.
#
# The first loop emits one "<path>\t<line>" record per file, where <line> is
# the first of the file's two leading lines that does not contain 'coding='.
# The grep chain then drops every record that mentions an accepted notice
# (case-insensitive), and the last loop keeps only the path column.
# Note: `echo $line` is unquoted, so word splitting collapses the tab into a
# single space before `cut -d' ' -f1` picks out the path.
find_files_with_missing_copyright() {
find ./megatron/ -type f -name '*.py' | while read path; do
    echo -en $path"\t"
    head -2 $path | grep -iv 'coding=' | head -1
done \
   | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \
   | grep -iv 'BSD 3-Clause License' \
   | grep -iv 'Copyright.*Microsoft' \
   | grep -iv 'Copyright.*The Open AI Team' \
   | grep -iv 'Copyright.*The Google AI' \
   | grep -iv 'Copyright.*Facebook' | while read line; do
     echo $line | cut -d' ' -f1
   done
}


# Collect the reported paths into an array. This relies on word splitting of
# the command substitution, so a path containing whitespace would be split.
declare RESULT=($(find_files_with_missing_copyright)) # (..) = array

# Fail (exit status 1) and list the offenders if any path was reported.
if [ "${#RESULT[@]}" -gt 0 ]; then
   echo "Error: Found files with missing copyright:"
   for (( i=0; i<"${#RESULT[@]}"; i++ )); do
      echo "path= ${RESULT[$i]}"
   done
   exit 1;
else
   echo "Ok: All files start with copyright notice"
fi
import glob
import sys
import json
import argparse
import os


def merge_json_files(json_path, out_file):
    """Concatenate the rows of every ``*.json`` file in *json_path* into *out_file*.

    Each input is read line by line and every line must parse as JSON
    (``json.loads`` raises on a malformed row); rows are written through
    unchanged, one per line.

    Args:
        json_path: Directory that is searched (non-recursively) for ``*.json``.
        out_file: Path of the merged output file; it is overwritten.

    Returns:
        The number of input files that were merged.
    """
    out_abspath = os.path.abspath(out_file)
    # Skip the output file itself: with the default arguments a previous
    # run's output lives in the input directory and would otherwise be read
    # while it is being truncated and rewritten. Sorting makes the merge
    # order reproducible across runs.
    json_files = sorted(
        fname
        for fname in glob.glob(json_path + '/*.json')
        if os.path.abspath(fname) != out_abspath
    )

    with open(out_file, 'w') as outfile:
        for counter, fname in enumerate(json_files, start=1):
            if counter % 1024 == 0:
                print("Merging at ", counter, flush=True)

            with open(fname, 'r') as infile:
                for row in infile:
                    json.loads(row)  # validation only; the raw row is kept
                    # A final row without a newline would otherwise be glued
                    # to the first row of the next file.
                    outfile.write(row if row.endswith('\n') else row + '\n')

    return len(json_files)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--json_path", type=str, default=".",
                        help="path where all the json files are located")

    parser.add_argument("--output_file", type=str, default="merged_output.json",
                        help="filename where the merged json should go")

    args = parser.parse_args()

    merge_json_files(args.json_path, args.output_file)

    print("Merged file", args.output_file, flush=True)
2 | 3 | """Computes theoretical memory footprint for model training without instantiating 4 | a model and running training iterations on GPU(s).""" 5 | 6 | from megatron.training import get_args 7 | from megatron.training.initialize import initialize_megatron 8 | from megatron.training.theoretical_memory_usage import report_theoretical_memory 9 | 10 | if __name__ == "__main__": 11 | initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True) 12 | args = get_args() 13 | 14 | report_theoretical_memory(args, verbose=True) 15 | -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . 
import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_PROJECT_DIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:23.09-py3 2 | 3 | RUN pip install -U faiss-gpu 4 | 5 | RUN apt update 6 | 7 | RUN apt install -qy htop 8 | 9 | RUN pip install -U transformers 10 | 11 | RUN pip install --upgrade google-api-python-client 12 | 13 | RUN pip install sentencepiece 14 | 15 | RUN pip install h5py 16 | 17 | RUN pip install nltk 18 | 19 | RUN pip install einops 20 | -------------------------------------------------------------------------------- /tools/retro/sft/README.md: -------------------------------------------------------------------------------- 1 | ## Note 2 | 3 | The content within this `sft` directory is still under active development and will be updated soon. -------------------------------------------------------------------------------- /tools/retro/sft/open_inst.sh: -------------------------------------------------------------------------------- 1 | DATA_BLEND="1.0 open_inst" 2 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
import sys
import json
import requests


if __name__ == "__main__":
    # Interactive client: reads a prompt and a token budget from stdin, PUTs
    # them as JSON to http://<argv[1]>/api and prints the first generated text.
    if len(sys.argv) < 2:
        sys.exit("Usage: python text_generation_cli.py <host[:port]>")

    url = sys.argv[1]
    url = 'http://' + url + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        # Parse with int() only: eval() would execute whatever expression is
        # typed at the prompt.
        tokens_to_generate = int(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])