├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── enhancement.md │ ├── question.md │ └── regression.md └── workflows │ └── stale.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── CONTRIBUTING.md ├── Dockerfile.ci ├── Dockerfile.linting ├── LICENSE ├── MANIFEST.in ├── README.md ├── artifact ├── README.md ├── example-results │ ├── full-exp.txt │ └── quick-exp.txt ├── exp_one_host.csv ├── full_exp.sh ├── quick_exp.sh └── show_result_full_exp.py ├── docs ├── llama_mistral.md └── source │ ├── api-guide │ ├── context_parallel.rst │ ├── datasets.rst │ ├── dist_checkpointing.rst │ ├── dist_checkpointing.strategies.rst │ ├── distributed.rst │ ├── fusions.rst │ ├── index.rst │ ├── models.bert.rst │ ├── models.gpt.rst │ ├── models.rst │ ├── models.t5.rst │ ├── moe.rst │ ├── num_microbatches_calculator.rst │ ├── pipeline_parallel.rst │ ├── tensor_parallel.rst │ └── transformer.rst │ ├── distrib_optimizer.md │ ├── images │ ├── context_parallel │ │ ├── CP_overview.png │ │ └── CP_results.png │ └── distrib_optimizer │ │ ├── data_flow.png │ │ └── sharding_scheme.png │ ├── index.rst │ └── user-guide │ └── index.rst ├── examples ├── academic_paper_scripts │ ├── detxoify_lm │ │ ├── README.md │ │ ├── annotations │ │ │ ├── filter-selfgeneration.py │ │ │ ├── perspective_api_annotate.py │ │ │ └── preprocess.sh │ │ ├── finetune_gpt.py │ │ ├── finetune_gpt_distributed-1.3b.sh │ │ ├── generate-1.3b.sh │ │ ├── generate_samples_gpt.py │ │ ├── perspective_api.py │ │ └── self_generation │ │ │ └── selfgenerate-1.3b-unconditional.sh │ ├── msdp │ │ ├── README.md │ │ ├── data_processing.sh │ │ ├── eval_knwl_generation.sh │ │ ├── eval_resp_generation.sh │ │ ├── prep_resp_gen.sh │ │ ├── prompt_knwl_gen.sh │ │ └── prompt_resp_gen.sh │ └── sc21 │ │ ├── CONFIG.sh │ │ ├── README.md │ │ ├── SBATCH.sh │ │ ├── SRUN.sh │ │ ├── run_figure_11.sh │ │ ├── run_figure_12.sh │ │ ├── run_figure_13.sh │ │ ├── run_figure_14.sh │ │ ├── run_figure_15.sh │ │ ├── run_figure_16.sh │ │ ├── run_figure_17.sh │ │ ├── run_figure_18.sh │ │ └── run_table_1.sh ├── bert │ ├── README.md │ └── train_bert_340m_distributed.sh ├── gpt3 │ ├── README.md │ ├── gpt_config.yaml │ └── train_gpt3_175b_distributed.sh ├── inference │ ├── README.md │ ├── gpt │ │ └── simple_gpt_batch_inference.py │ ├── quantization │ │ ├── README.md │ │ ├── ptq_trtllm_llama_7b.sh │ │ ├── ptq_trtllm_nemotron3_8b.sh │ │ ├── text_generation_ptq.py │ │ └── trtllm_text_generation.py │ ├── run_text_generation_server_345M.sh │ └── run_text_generation_server_345M_8_tensor_parallel.sh ├── mamba │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── run_text_gen_server_8b.sh │ ├── run_text_gen_server_8b_gpt3.sh │ └── train.sh ├── mixtral │ ├── README.md │ └── train_mixtral_8x7b_distributed.sh ├── multimodal │ ├── Dockerfile │ ├── README.md │ ├── assets │ │ └── pretrain_curves.png │ ├── clip_converter.py │ ├── combine_mistral_clip.sh │ ├── combine_state_dicts.py │ ├── config.py │ ├── convert_llava_pretrain_to_wds.py │ ├── dataloader_provider.py │ ├── dataset_helpers.py │ ├── evaluate_coco.py │ ├── evaluate_mmmu.py │ ├── evaluate_textvqa.py │ ├── evaluate_vqav2.py │ ├── layer_specs.py │ ├── manual_prompts.json │ ├── pretrain_dataset.yaml │ ├── pretrain_mistral_clip.sh │ ├── run_text_generation.py │ ├── sft_dataset.yaml │ ├── sft_mistral_clip.sh │ ├── text_generation_mistral_clip.sh │ └── train.py ├── retro │ ├── README.md │ ├── preprocess_data.sh │ └── train_retro_2b_distributed.sh ├── run_simple_mcore_train_loop.py └── t5 │ ├── README.md │ ├── t5_mcore_train_curve.png │ └── 
train_t5_220m_distributed.sh ├── images ├── expt-pp32-flops.png ├── expt-pp32-mem.png ├── model_table.png ├── schedule-interlaced.png ├── schedule-vocab-1.png ├── schedule-vocab-2.png ├── st-passes-1.png ├── st-passes-2.png ├── strong_scaling.png └── weak_scaling.png ├── input_store.py ├── jet-tests.yml ├── megatron ├── core │ ├── QuickStart.md │ ├── README.md │ ├── README_STRAGGLER.md │ ├── __init__.py │ ├── datasets │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── blended_dataset.py │ │ ├── blended_megatron_dataset_builder.py │ │ ├── blended_megatron_dataset_config.py │ │ ├── gpt_dataset.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ ├── masked_dataset.py │ │ ├── megatron_dataset.py │ │ ├── megatron_tokenizer.py │ │ ├── multimodal_dataset.py │ │ ├── readme.md │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_embedders.py │ │ │ │ ├── config.py │ │ │ │ ├── gpt_chunk_datasets.py │ │ │ │ └── tokenizers.py │ │ │ ├── db │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── dataset.py │ │ │ │ └── utils.py │ │ │ ├── external_libs.py │ │ │ ├── index │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── factory.py │ │ │ │ ├── index.py │ │ │ │ ├── indexes │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── faiss_base.py │ │ │ │ │ └── faiss_par_add.py │ │ │ │ ├── utils.py │ │ │ │ └── validate.py │ │ │ ├── query │ │ │ │ ├── __init__.py │ │ │ │ ├── gpt_chunk_dataset.py │ │ │ │ ├── multi_split_gpt_dataset.py │ │ │ │ ├── query.py │ │ │ │ ├── retro_dataset.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── t5_dataset.py │ │ ├── utils.py │ │ └── utils_s3.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── async_utils.py │ │ │ ├── base.py │ │ │ ├── common.py │ │ │ ├── filesystem_async.py │ │ │ ├── fully_parallel.py │ │ │ ├── resharding.py │ │ │ ├── state_dict_saver.py │ │ │ ├── tensorstore.py │ │ │ ├── torch.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ ├── utils.py │ │ └── validation.py │ ├── distributed │ │ ├── __init__.py │ │ ├── distributed_data_parallel.py │ │ ├── distributed_data_parallel_config.py │ │ ├── finalize_model_grads.py │ │ └── param_and_grad_buffer.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_geglu.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_bias_swiglu.py │ │ ├── fused_cross_entropy.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference │ │ ├── __init__.py │ │ ├── ammo_support │ │ │ ├── __init__.py │ │ │ └── gpt │ │ │ │ ├── __init__.py │ │ │ │ ├── model_specs.py │ │ │ │ └── state_dict_hooks.py │ │ ├── common_inference_params.py │ │ ├── communication_utils.py │ │ ├── engines │ │ │ ├── __init__.py │ │ │ ├── abstract_engine.py │ │ │ └── mcore_engine.py │ │ ├── inference_request.py │ │ ├── model_inference_wrappers │ │ │ ├── __init__.py │ │ │ ├── abstract_model_inference_wrapper.py │ │ │ ├── gpt │ │ │ │ ├── __init__.py │ │ │ │ └── gpt_inference_wrapper.py │ │ │ └── inference_wrapper_config.py │ │ ├── scheduler.py │ │ ├── text_generation_controllers │ │ │ ├── __init__.py │ │ │ └── simple_text_generation_controller.py │ │ └── utils.py │ ├── inference_params.py │ ├── jit.py │ ├── model_parallel_config.py │ ├── models │ │ ├── T5 │ │ │ ├── __init__.py │ │ │ ├── t5_model.py │ │ │ └── t5_spec.py │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── bert_layer_specs.py │ │ │ ├── bert_lm_head.py │ │ │ ├── 
bert_model.py │ │ │ └── pooler.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── embeddings │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model_embedding.py │ │ │ │ └── rotary_pos_embedding.py │ │ │ ├── language_module │ │ │ │ ├── __init__.py │ │ │ │ └── language_module.py │ │ │ └── vision_module │ │ │ │ ├── __init__.py │ │ │ │ └── vision_module.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_layer_specs.py │ │ │ └── gpt_model.py │ │ ├── mamba │ │ │ ├── __init__.py │ │ │ ├── mamba_layer_specs.py │ │ │ └── mamba_model.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── llava_model.py │ │ │ └── llava_spec.py │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── base_attention.py │ │ │ ├── config.py │ │ │ ├── decoder_attention.py │ │ │ ├── decoder_spec.py │ │ │ ├── encoder_attention.py │ │ │ ├── encoder_spec.py │ │ │ ├── model.py │ │ │ └── utils.py │ │ └── vision │ │ │ ├── __init__.py │ │ │ ├── clip_vit_model.py │ │ │ ├── multimodal_projector.py │ │ │ └── vit_layer_specs.py │ ├── num_microbatches_calculator.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── distrib_optimizer.py │ │ ├── grad_scaler.py │ │ ├── optimizer.py │ │ └── optimizer_config.py │ ├── package_info.py │ ├── packed_seq_params.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── interlaced_schedule.py │ │ ├── p2p_communication.py │ │ ├── schedule_timers.py │ │ ├── schedules.py │ │ └── vocab_parallel_schedule.py │ ├── requirements.txt │ ├── ssm │ │ ├── __init__.py │ │ ├── mamba_block.py │ │ ├── mamba_hybrid_layer_allocation.py │ │ ├── mamba_layer.py │ │ ├── mamba_mixer.py │ │ └── triton_cache_manager.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ ├── utils.py │ │ ├── vocab_input.py │ │ ├── vocab_input_store.py │ │ ├── vocab_output.py │ │ └── vocab_output_store.py │ ├── timers.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ ├── __init__.py │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── moe │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── spec_utils.py │ │ ├── torch_layer_norm.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── inference │ ├── __init__.py │ ├── arguments.py │ ├── checkpointing.py │ ├── gpt │ │ ├── __init__.py │ │ └── model_provider.py │ ├── static │ │ └── index.html │ ├── text_generation │ │ ├── __init__.py │ │ ├── api.py │ │ ├── beam_utils.py │ │ ├── communication.py │ │ ├── forward_step.py │ │ ├── generation.py │ │ ├── sampling.py │ │ └── tokenization.py │ └── text_generation_server.py ├── legacy │ ├── data │ │ ├── __init__.py │ │ ├── autoaugment.py │ │ ├── biencoder_dataset_utils.py │ │ ├── data_samplers.py │ │ ├── dataset_utils.py │ │ ├── ict_dataset.py │ │ ├── image_folder.py │ │ ├── multimodal_dataset.py │ │ ├── orqa_wiki_dataset.py │ │ ├── realm_dataset_utils.py │ │ ├── realm_index.py │ │ └── vit_dataset.py │ ├── fp16_deprecated │ │ └── loss_scaler.py │ ├── fused_kernels │ │ ├── __init__.py │ │ ├── compat.h │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_fused_kernels.py │ │ └── type_shim.h │ ├── indexer.py │ ├── model │ │ ├── __init__.py │ │ ├── bert_model.py │ │ ├── biencoder_model.py │ │ ├── 
classification.py │ │ ├── enums.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ ├── fused_softmax.py │ │ ├── gpt_model.py │ │ ├── language_model.py │ │ ├── module.py │ │ ├── multiple_choice.py │ │ ├── realm_model.py │ │ ├── rms_norm.py │ │ ├── t5_model.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── vision │ │ │ ├── classification.py │ │ │ ├── dino.py │ │ │ ├── esvit_swin_backbone.py │ │ │ ├── inpainting.py │ │ │ ├── knn_monitor.py │ │ │ ├── mit_backbone.py │ │ │ ├── swin_backbone.py │ │ │ ├── utils.py │ │ │ └── vit_backbone.py │ └── mpu │ │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py └── training │ ├── __init__.py │ ├── activations.py │ ├── arguments.py │ ├── async_utils.py │ ├── checkpointing.py │ ├── dist_signal_handler.py │ ├── global_vars.py │ ├── initialize.py │ ├── log_handler.py │ ├── one_logger_utils.py │ ├── optimizer_param_scheduler.py │ ├── theoretical_memory_usage.py │ ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py │ ├── training.py │ ├── utils.py │ └── yaml_arguments.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt.sh ├── pretrain_ict.py ├── pretrain_mamba.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── pretrain_vlm.py ├── pyproject.toml ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── functional_tests │ ├── __init__.py │ ├── jet_recipes │ │ ├── MR-bert.yaml │ │ ├── MR-gpt-nemo.yaml │ │ ├── MR-gpt.yaml │ │ ├── MR-multimodal.yaml │ │ ├── MR-t5.yaml │ │ ├── build-pyt.yaml │ │ ├── local-generator.py │ │ ├── nightly-bert.yaml │ │ ├── nightly-gpt.yaml │ │ ├── weekly-gpt.yaml │ │ └── weekly-t5.yaml │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── common.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── jet_test_pipeline.py │ │ ├── multitest_ci_pipeline.py │ │ ├── test_ci_pipeline.py │ │ ├── test_fp8_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ ├── _run_local_training.sh │ │ ├── restart_jet_log_jobs.sh │ │ └── run_release_record.sh │ ├── test_results │ │ └── jet │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json │ │ │ ├── 
bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json │ │ │ ├── bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json │ │ │ ├── bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json │ │ │ ├── bert_mr_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json │ │ │ ├── 
gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json │ │ │ ├── multimodal_llava_mr_mcore_te_tp2_pp4_dgx_a100_1N8G.json │ │ │ └── t5_220m_mr_mcore_tp2_pp4_dgx_a100_1N8G.json │ └── test_scripts │ │ ├── bert │ │ └── pretrain_bert_distributed_test.sh │ │ ├── gpt3 │ │ ├── pretrain_gpt3_distributed_test.sh │ │ └── pretrain_gpt3_nemo_test.sh │ │ ├── multimodal │ │ └── pretrain_llava_distributed_test.sh │ │ ├── retro │ │ └── pretrain_retro_distributed_test.sh │ │ └── t5 │ │ └── pretrain_t5_distributed_test.sh └── unit_tests │ ├── __init__.py │ ├── data │ ├── __init__.py │ ├── test_bin_reader.py │ ├── test_builder.py │ ├── test_gpt_dataset.py │ ├── test_multimodal_dataset.py │ ├── test_preprocess_data.py │ └── test_preprocess_mmdata.py │ ├── dist_checkpointing │ ├── __init__.py │ ├── conftest.py │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── test_bert_model.py │ │ ├── test_gpt_model.py │ │ ├── test_grouped_mlp.py │ │ ├── test_mlp_glu.py │ │ ├── test_retro_model.py │ │ ├── test_sequential_mlp.py │ │ └── test_t5_model.py │ ├── test_async_save.py │ ├── test_cached_metadata.py │ ├── test_flattened_resharding.py │ ├── test_fully_parallel.py │ ├── test_mapping.py │ ├── test_optimizer.py │ └── test_serialization.py │ ├── distributed │ └── test_param_and_grad_buffer.py │ ├── fusions │ └── test_torch_softmax.py │ ├── inference │ ├── __init__.py │ ├── engines │ │ ├── __init__.py │ │ └── test_mcore_engine.py │ ├── model_inference_wrappers │ │ ├── __init__.py │ │ ├── gpt │ │ │ └── test_gpt_inference_wrapper.py │ │ └── test_model_inference_wrapper_config.py │ ├── test_common_inference_params.py │ ├── test_inference_utils.py │ ├── test_modelopt_gpt_model.py │ ├── test_scheduler.py │ └── text_generation_controllers │ │ ├── __init__.py │ │ └── test_simple_text_generation_controller.py │ ├── models │ ├── __init__.py │ ├── test_base_embedding.py │ ├── test_bert_model.py │ ├── test_clip_vit_model.py │ 
├── test_gpt_model.py │ ├── test_llava_model.py │ ├── test_mamba_model.py │ ├── test_multimodal_projector.py │ └── test_t5_model.py │ ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py │ ├── tensor_parallel │ ├── __init__.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialization.py │ ├── test_layers.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_imports.py │ ├── test_local_multi_tensor_fns.py │ ├── test_num_microbatches_calculator.py │ ├── test_optimizer.py │ ├── test_parallel_state.py │ ├── test_training.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── moe │ ├── __init__.py │ ├── test_a2a_token_dispatcher.py │ ├── test_aux_loss.py │ ├── test_grouped_mlp.py │ ├── test_routers.py │ ├── test_sequential_mlp.py │ └── test_token_dispatcher.py │ ├── test_attention.py │ ├── test_attention_packed_seq.py │ ├── test_core_attention.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_retro_attention.py │ ├── test_spec_customization.py │ ├── test_transformer_block.py │ └── test_transformer_layer.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py └── huggingface.py ├── checkpoint ├── convert.py ├── hybrid_conversion.py ├── loader_llama_mistral.py ├── loader_mcore.py ├── loader_megatron.py ├── loader_mixtral_hf.py ├── saver_mcore.py ├── saver_megatron.py ├── setter.py └── utils.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── report_theoretical_memory.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── config_utils.py ├── docker │ └── Dockerfile ├── preprocess_data.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh └── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py ├── run_mamba_text_generation_server.py ├── run_text_generation_server.py ├── run_vlm_text_generation.py └── text_generation_cli.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 
21 | 
22 | **Environment (please complete the following information):**
23 | - Megatron-LM commit ID
24 | - PyTorch version
25 | - CUDA version
26 | - NCCL version
27 | 
28 | **Proposed fix**
29 | If you have a proposal for how to fix the issue, state it here or link to a PR.
30 | 
31 | **Additional context**
32 | Add any other context about the problem here.
33 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancement.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: ENHANCEMENT
3 | about: Suggest an idea to improve this project
4 | title: "[ENHANCEMENT]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Proposed implementation**
20 | If you have a proposed implementation for the feature, state it here or link to a PR.
21 | 
22 | **Additional context**
23 | Add any other context or screenshots about the feature request here.
24 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: QUESTION
3 | about: Ask a question about Megatron-LM that is not a bug, regression, or enhancement request
4 | title: "[QUESTION]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Your question**
11 | Ask a clear and concise question about Megatron-LM.
12 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/regression.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: REGRESSION
3 | about: Report a regression in speed or accuracy due to a Megatron-LM update
4 | title: "[REGRESSION]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Describe the regression**
11 | A clear and concise description of what the regression is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.
15 | 
16 | **Previous performance**
17 | What speed or accuracy did you previously see?
18 | 
19 | **New performance**
20 | What speed or accuracy do you see after the update?
21 | 
22 | **Stack trace/logs**
23 | If applicable, add the stack trace or logs related to the regression.
24 | 
25 | **Environment (please complete the following information):**
26 | - Previous Megatron-LM commit ID
27 | - New Megatron-LM commit ID
28 | - Previous PyTorch version
29 | - New PyTorch version
30 | - Previous CUDA version
31 | - New CUDA version
32 | - Previous NCCL version
33 | - New NCCL version
34 | 
35 | **Proposed fix**
36 | If you have a proposal for how to fix the issue, state it here or link to a PR.
37 | 
38 | **Additional context**
39 | Add any other context about the problem here.
40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.' 27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | .vscode 10 | local/ 11 | *.tar.gz 12 | *.tar.gz.* 13 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | [MCORE][3] 2 | megatron/core/ @shanmugamr @jcasper @eharper @terryk @okoenig 3 | 4 | [TESTS] 5 | tests/ @shanmugamr @terryk @okoenig 6 | 7 | [MODELOPT] 8 | examples/inference/quantization @chenhany @kmorabia 9 | -------------------------------------------------------------------------------- /Dockerfile.linting: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | ARG FROM_IMAGE_NAME 4 | FROM $FROM_IMAGE_NAME 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ 8 | /etc/apt/apt.conf.d/docker-clean 9 | 10 | 11 | RUN pip3 install --no-cache-dir \ 12 | black==24.4.2 \ 13 | isort 14 | 15 | COPY . 
/opt/megatron-lm
16 | 
17 | WORKDIR /opt/megatron-lm
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include megatron/core/requirements.txt
2 | 
--------------------------------------------------------------------------------
/artifact/example-results/quick-exp.txt:
--------------------------------------------------------------------------------
1 | Method: baseline
2 | Peak Memory: 33.7227 GB
3 | MFU: 28.8833 %
4 | 
5 | Method: redis
6 | Peak Memory: 33.7227 GB
7 | MFU: 44.6221 %
8 | 
9 | Method: interlaced
10 | Peak Memory: 30.7168 GB
11 | MFU: 53.8638 %
12 | 
13 | Method: vocab-1
14 | Peak Memory: 27.3848 GB
15 | MFU: 53.9708 %
16 | 
17 | Method: vocab-2
18 | Peak Memory: 26.1094 GB
19 | MFU: 53.5333 %
--------------------------------------------------------------------------------
/docs/source/api-guide/dist_checkpointing.strategies.rst:
--------------------------------------------------------------------------------
1 | dist\_checkpointing.strategies package
2 | ======================================
3 | 
4 | Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies).
5 | 
6 | Strategies can be used for implementing new checkpoint formats or new ways of saving/loading existing formats that are better suited to a given use case.
7 | Strategies are passed to the `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure.
8 | 
9 | Submodules
10 | ----------
11 | 
12 | dist\_checkpointing.strategies.base module
13 | ------------------------------------------
14 | 
15 | .. automodule:: core.dist_checkpointing.strategies.base
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 | 
20 | dist\_checkpointing.strategies.tensorstore module
21 | -------------------------------------------------
22 | 
23 | .. automodule:: core.dist_checkpointing.strategies.tensorstore
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
27 | 
28 | dist\_checkpointing.strategies.two\_stage module
29 | ------------------------------------------------
30 | 
31 | .. automodule:: core.dist_checkpointing.strategies.two_stage
32 |    :members:
33 |    :undoc-members:
34 |    :show-inheritance:
35 | 
36 | dist\_checkpointing.strategies.zarr module
37 | ------------------------------------------
38 | 
39 | .. automodule:: core.dist_checkpointing.strategies.zarr
40 |    :members:
41 |    :undoc-members:
42 |    :show-inheritance:
43 | 
44 | Module contents
45 | ---------------
46 | 
47 | .. automodule:: core.dist_checkpointing.strategies
48 |    :members:
49 |    :undoc-members:
50 |    :show-inheritance:
51 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/index.rst:
--------------------------------------------------------------------------------
1 | API Guide
2 | =========
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    models
8 |    tensor_parallel
9 |    context_parallel
10 |    pipeline_parallel
11 |    fusions
12 |    transformer
13 |    moe
14 |    dist_checkpointing
15 |    distributed
16 |    datasets
17 |    num_microbatches_calculator
18 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.bert.rst:
--------------------------------------------------------------------------------
1 | models.bert package
2 | ===================
3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks.
4 | 
5 | Submodules
6 | ----------
7 | 
8 | models.bert.bert\_model module
9 | ------------------------------
10 | 
11 | .. automodule:: core.models.bert.bert_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.bert
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.gpt.rst:
--------------------------------------------------------------------------------
1 | models.gpt package
2 | ==================
3 | This is the implementation of the popular GPT model. It supports several features such as model parallelization (tensor parallel, pipeline parallel, data parallel), mixture of experts, FP8, distributed optimizer, etc. We are constantly adding new features, so be on the lookout, or raise an issue if you want to have something added.
4 | 
5 | Submodules
6 | ----------
7 | 
8 | models.gpt.gpt\_model module
9 | ----------------------------
10 | 
11 | .. automodule:: core.models.gpt.gpt_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.gpt
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.rst:
--------------------------------------------------------------------------------
1 | models package
2 | ==============
3 | This package contains most of the popular LLMs. Currently we support GPT, BERT, T5, and Retro. This is an ever-growing list, so keep an eye out.
4 | 
5 | Subpackages
6 | -----------
7 | 
8 | .. toctree::
9 |    :maxdepth: 4
10 | 
11 |    models.gpt
12 |    models.t5
13 |    models.bert
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.t5.rst:
--------------------------------------------------------------------------------
1 | models.t5 package
2 | =================
3 | 
4 | Submodules
5 | ----------
6 | 
7 | models.t5.t5\_model module
8 | --------------------------
9 | 
10 | .. automodule:: core.models.T5.t5_model
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models.T5
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/moe.rst:
--------------------------------------------------------------------------------
1 | Mixture of Experts package
2 | ==========================
3 | 
4 | .. mdinclude:: ../../../megatron/core/transformer/moe/README.md
5 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/num_microbatches_calculator.rst:
--------------------------------------------------------------------------------
1 | Microbatches Calculator
2 | =======================
3 | This API is used to calculate the number of microbatches per training step, given the global batch size, micro batch size, and data-parallel size.
4 | 
5 | 
6 | Module contents
7 | ---------------
8 | 
9 | .. automodule:: core.num_microbatches_calculator
10 |    :members:
11 |    :undoc-members:
12 |    :show-inheritance:
13 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/pipeline_parallel.rst:
--------------------------------------------------------------------------------
1 | pipeline\_parallel package
2 | ==========================
3 | 
4 | This package contains implementations for two different pipeline parallelism
5 | schedules (one without interleaving and one with interleaving, see `Efficient
6 | Large-Scale Language Model Training on GPU Clusters Using Megatron-LM <https://arxiv.org/abs/2104.04473>`_
7 | for details), and a default no-pipelining schedule. It also contains methods
8 | for the point-to-point communication that is needed between pipeline stages.
9 | 
10 | Submodules
11 | ----------
12 | 
13 | pipeline\_parallel.p2p\_communication module
14 | --------------------------------------------
15 | 
16 | Contains implementations for the various point-to-point communication needed
17 | (e.g., `recv_forward` and `recv_backward`) in the different pipeline parallelism
18 | schedules.
19 | 
20 | .. automodule:: core.pipeline_parallel.p2p_communication
21 |    :members:
22 |    :undoc-members:
23 |    :show-inheritance:
24 | 
25 | pipeline\_parallel.schedules module
26 | -----------------------------------
27 | 
28 | Contains implementations for two pipeline parallelism schedules
29 | (`forward_backward_pipelining_with_interleaving` for pipeline parallelism with
30 | interleaving, `forward_backward_pipelining_without_interleaving` for pipeline
31 | parallelism without interleaving) and a default no-pipelining schedule
32 | (`forward_backward_no_pipelining`). `get_forward_backward_func` returns the right
33 | scheduling function to use based on the configuration being trained
34 | (e.g., if pipeline-parallel size is 1, use `forward_backward_no_pipelining`).
35 | 
36 | .. automodule:: core.pipeline_parallel.schedules
37 |    :members:
38 |    :undoc-members:
39 |    :show-inheritance:
40 | 
41 | Module contents
42 | ---------------
43 | 
44 | .. automodule:: core.pipeline_parallel
45 |    :members:
46 |    :undoc-members:
47 |    :show-inheritance:
48 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/tensor_parallel.rst:
--------------------------------------------------------------------------------
1 | tensor\_parallel package
2 | ========================
3 | 
4 | This package contains an implementation for tensor parallelism in transformer
5 | models (see `Megatron-LM: Training Multi-Billion Parameter Language Models
6 | Using Model Parallelism <https://arxiv.org/abs/1909.08053>`_ and `Reducing
7 | Activation Recomputation in Large Transformer Models <https://arxiv.org/abs/2205.05198>`_
8 | for details).
9 | 
10 | Submodules
11 | ----------
12 | 
13 | tensor\_parallel.cross\_entropy module
14 | --------------------------------------
15 | 
16 | .. automodule:: core.tensor_parallel.cross_entropy
17 |    :members:
18 |    :undoc-members:
19 |    :show-inheritance:
20 | 
21 | tensor\_parallel.data module
22 | ----------------------------
23 | 
24 | .. automodule:: core.tensor_parallel.data
25 |    :members:
26 |    :undoc-members:
27 |    :show-inheritance:
28 | 
29 | tensor\_parallel.layers module
30 | ------------------------------
31 | 
32 | .. automodule:: core.tensor_parallel.layers
33 |    :members:
34 |    :undoc-members:
35 |    :show-inheritance:
36 | 
37 | tensor\_parallel.mappings module
38 | --------------------------------
39 | 
40 | .. automodule:: core.tensor_parallel.mappings
41 |    :members:
42 |    :undoc-members:
43 |    :show-inheritance:
44 | 
45 | tensor\_parallel.random module
46 | ------------------------------
47 | 
48 | .. automodule:: core.tensor_parallel.random
49 |    :members:
50 |    :undoc-members:
51 |    :show-inheritance:
52 | 
53 | tensor\_parallel.utils module
54 | -----------------------------
55 | 
56 | .. automodule:: core.tensor_parallel.utils
57 |    :members:
58 |    :undoc-members:
59 |    :show-inheritance:
60 | 
61 | Module contents
62 | ---------------
63 | 
64 | .. automodule:: core.tensor_parallel
65 |    :members:
66 |    :undoc-members:
67 |    :show-inheritance:
68 | 
--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/context_parallel/CP_overview.png
--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/context_parallel/CP_results.png
--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/distrib_optimizer/data_flow.png
--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/sharding_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/distrib_optimizer/sharding_scheme.png
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Lumache documentation master file, created by
2 |    sphinx-quickstart on Tue Aug 15 13:44:10 2023.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | Megatron Core User Guide
7 | ===================================
8 | 
9 | **Megatron Core** is a Python library that provides the core components required to build your language models.
10 | A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo>`_. It offers a *simple* and
11 | *intuitive* API.
12 | 
13 | .. toctree::
14 |    :maxdepth: 2
15 |    :caption: User Guide
16 | 
17 |    user-guide/index
18 | 
19 | .. toctree::
20 |    :maxdepth: 3
21 |    :caption: API Guide
22 | 
23 |    api-guide/index
24 | 
--------------------------------------------------------------------------------
/docs/source/user-guide/index.rst:
--------------------------------------------------------------------------------
1 | USER GUIDE
2 | ==========
3 | 
4 | .. mdinclude:: ../../../megatron/core/QuickStart.md
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh:
--------------------------------------------------------------------------------
1 | VOCAB_FILE=gpt2-vocab.json
2 | MERGE_FILE=gpt2-merges.txt
3 | 
4 | python3 tools/preprocess_data.py \
5 | --input $1 \
6 | --output-prefix $2 \
7 | --vocab-file $VOCAB_FILE \
8 | --merge-file $MERGE_FILE \
9 | --tokenizer-type GPT2BPETokenizer \
10 | --append-eod --workers 20 --chunk-size 25
11 | 
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | CHECKPOINT_PATH=$2 # Your model ckpt
3 | VOCAB_FILE=gpt2-vocab.json
4 | MERGE_FILE=gpt2-merges.txt
5 | 
6 | GPUS_PER_NODE=1
7 | # Change for multinode config
8 | MASTER_ADDR=localhost
9 | MASTER_PORT=$(($RANDOM + 1024))
10 | NNODES=1
11 | NODE_RANK=0
12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
13 | NUM_SAMPLES=$(wc -l < $1)
14 | PREFIX=$(basename $2)
15 | SEED=$(($RANDOM))
16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl
17 | 
18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
19 | 
20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
21 | --tensor-model-parallel-size 1 \
22 | --num-layers 24 \
23 | --hidden-size 2048 \
24 | --load $CHECKPOINT_PATH \
25 | --num-attention-heads 32 \
26 | --max-position-embeddings 2048 \
27 | --tokenizer-type GPT2BPETokenizer \
28 | --fp16 \
29 | --micro-batch-size 400 \
30 | --seq-length 2048 \
31 | --out-seq-length 20 \
32 | --temperature 1.0 \
33 | --vocab-file $VOCAB_FILE \
34 | --merge-file $MERGE_FILE \
35 | --sample-input-file $1 \
36 | --sample-output-file $OUTPUT \
37 | --num-samples $NUM_SAMPLES \
38 | --max-tokens-to-oom 1200000 \
39 | --top_p 0.9 \
40 | --seed $SEED
41 | 
42 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | CHECKPOINT_PATH=$2 # Your model ckpt
3 | SHARE_DATA=$PWD # current work dir
4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab
5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file
6 | 
7 | GPUS_PER_NODE=1
8 | # Change for multinode config
9 | MASTER_ADDR=localhost
10 | MASTER_PORT=$(($RANDOM + 1024))
11 | NNODES=1
12 | NODE_RANK=0
13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
14 | SEED=$3
15 | SUFFIX=$(basename $CHECKPOINT_PATH)
16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/
17 | mkdir -p $save_dir
18 | echo $save_dir/$SEED.out
19 | 
20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
21 | 
22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
23 | --tensor-model-parallel-size 1 \
24 | --num-layers 24 \
25 | --hidden-size 2048 \
26 | --load $CHECKPOINT_PATH \
27 | --num-attention-heads 32 \
28 | --max-position-embeddings 2048 \
29 | --tokenizer-type GPT2BPETokenizer \
30 | --fp16 \
31 | --micro-batch-size 150 \
32 | --seq-length 2048 \
33 | --out-seq-length 1000 \
34 | --temperature 1.0 \
35 | --vocab-file $VOCAB_FILE \
36 | --merge-file $MERGE_FILE \
37 | --num-samples $1 \
38 | --top_p 0.9 \
39 | --max-tokens-to-oom 1200000 \
40 | --genfile $save_dir/$SEED.out \
41 | --seed $SEED
42 | 
43 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation
3 | 
4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge and response generation. More details are available in the [`msdp` task directory](../../../tasks/msdp).
5 | 
6 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #########################
4 | # Evaluate the F1 scores.
5 | #########################
6 | 
7 | WORLD_SIZE=1
8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
9 | --nnodes 1 \
10 | --node_rank 0 \
11 | --master_addr localhost \
12 | --master_port 6000"
13 | 
14 | MODEL_GEN_PATH= \
15 | (e.g., /testseen_knowledge_generations.txt)
16 | GROUND_TRUTH_PATH= \
17 | (e.g., /testseen_knowledge_reference.txt)
18 | 
19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
20 | --num-layers 24 \
21 | --hidden-size 1024 \
22 | --num-attention-heads 16 \
23 | --seq-length 2048 \
24 | --max-position-embeddings 2048 \
25 | --micro-batch-size 4 \
26 | --task MSDP-EVAL-F1 \
27 | --guess-file ${MODEL_GEN_PATH} \
28 | --answer-file ${GROUND_TRUTH_PATH}
29 | 
30 | 
31 | ############################################
32 | # Evaluate BLEU, METEOR, and ROUGE-L scores.
33 | ############################################
34 | 
35 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
36 | # evaluate the BLEU, METEOR, and ROUGE-L scores.
37 | 
38 | # To evaluate on these metrics, please set up the environment based on
39 | # the nlg-eval GitHub repository, and run the corresponding evaluation commands.
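# A rough sketch of that setup, assuming the instructions in the nlg-eval
# README still apply (the two commands below are not part of this repo):
#   pip install git+https://github.com/Maluuba/nlg-eval.git@master
#   nlg-eval --setup    # downloads the data files the metrics need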
40 | 
41 | nlg-eval \
42 | --hypothesis= \
43 | --references=
44 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prep_resp_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Preparing the input file for the response generation (second-stage prompting)
4 | 
5 | DIR=`pwd`
6 | 
7 | TEST_FILE= \
8 | (e.g., /testseen_processed.txt)
9 | KNOWLEDGE_FILE= \
10 | (e.g., /testseen_knowledge_generations.txt)
11 | PROCESSED_FILE= \
12 | (e.g., /testseen_processed_with_generated_knowledge.txt)
13 | 
14 | python ${DIR}/tasks/msdp/preprocessing.py \
15 | --func prepare_input \
16 | --test_file ${TEST_FILE} \
17 | --knwl_gen_file ${KNOWLEDGE_FILE} \
18 | --processed_file ${PROCESSED_FILE}
19 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
4 | # The input contains prompts and the current dialogue context; the output is the relevant knowledge
5 | # The size of the pretrained language model is 357M
6 | 
7 | WORLD_SIZE=8
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
10 | --nnodes 1 \
11 | --node_rank 0 \
12 | --master_addr localhost \
13 | --master_port 6000"
14 | 
15 | CHECKPOINT_PATH= (e.g., /357m)
16 | VOCAB_PATH= (e.g., /gpt2-vocab.json)
17 | MERGE_PATH= (e.g., /gpt2-merges.txt)
18 | INPUT_PATH= \
19 | (e.g., /testseen_processed.txt)
20 | PROMPT_PATH= \
21 | (e.g., /testseen_knowledge_prompts.json)
22 | OUTPUT_PATH= \
23 | (e.g., /testseen_knowledge_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 | --num-layers 24 \
27 | --hidden-size 1024 \
28 | --num-attention-heads 16 \
29 | --seq-length 2048 \
30 | --max-position-embeddings 2048 \
31 | --micro-batch-size 1 \
32 | --vocab-file ${VOCAB_PATH} \
33 | --merge-file ${MERGE_PATH} \
34 | --load ${CHECKPOINT_PATH} \
35 | --fp16 \
36 | --DDP-impl torch \
37 | --tokenizer-type GPT2BPETokenizer \
38 | --sample-input-file ${INPUT_PATH} \
39 | --sample-output-file ${OUTPUT_PATH} \
40 | --prompt-file ${PROMPT_PATH} \
41 | --prompt-type knowledge \
42 | --num-prompt-examples 10 \
43 | --task MSDP-PROMPT
44 | 
45 | # NOTE: If you use an API for the model generation, please use
46 | # the "--api-prompt" flag (set its value to True).
47 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response
4 | # The input contains prompts, the current dialogue context, and the knowledge generated in Stage-1
5 | # The output is the corresponding response.
6 | # The size of the pretrained language model is 357M
7 | 
8 | WORLD_SIZE=8
9 | 
10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
11 | --nnodes 1 \
12 | --node_rank 0 \
13 | --master_addr localhost \
14 | --master_port 6000"
15 | 
16 | CHECKPOINT_PATH= (e.g., /357m)
17 | VOCAB_PATH= (e.g., /gpt2-vocab.json)
18 | MERGE_PATH= (e.g., /gpt2-merges.txt)
19 | INPUT_PATH= (e.g., /testseen_processed.txt)
20 | PROMPT_PATH= \
21 | (e.g., /response_prompts.txt)
22 | OUTPUT_PATH= \
23 | (e.g., /output_testseen_response_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 | --num-layers 24 \
27 | --hidden-size 1024 \
28 | --num-attention-heads 16 \
29 | --seq-length 2048 \
30 | --max-position-embeddings 2048 \
31 | --micro-batch-size 1 \
32 | --vocab-file ${VOCAB_PATH} \
33 | --merge-file ${MERGE_PATH} \
34 | --load ${CHECKPOINT_PATH} \
35 | --fp16 \
36 | --DDP-impl torch \
37 | --tokenizer-type GPT2BPETokenizer \
38 | --sample-input-file ${INPUT_PATH} \
39 | --sample-output-file ${OUTPUT_PATH} \
40 | --prompt-file ${PROMPT_PATH} \
41 | --prompt-type response \
42 | --num-prompt-examples 20 \
43 | --task MSDP-PROMPT
44 | 
45 | # NOTE: If you use an API for the model generation, please use
46 | # the "--api-prompt" flag (set its value to True).
47 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/CONFIG.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | 
4 | # SLURM options.
5 | export SLURM_PARTITION=
6 | export SLURM_ACCOUNT=
7 | 
8 | 
9 | # Source code.
10 | export MEGATRON_CODE_DIR=
11 | 
12 | 
13 | # This variable is used to mount the relevant part of the filesystem
14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
15 | # launch directory already get mounted; this variable should be used to
16 | # mount the directories that contain the data and tokenizer files.
17 | export DOCKER_MOUNT_DIR=
18 | 
19 | 
20 | # Data and tokenizer files.
21 | MEGATRON_DATA=
22 | BPE_VOCAB_FILE=
23 | BPE_MERGE_FILE=
24 | 
25 | 
26 | # Megatron input parameters.
27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
28 | # that are not listed here.
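# As a concrete example (taken from the run_figure_*.sh scripts in this
# directory), a caller typically sets something like
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# before sourcing this file.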
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 
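# (With the interleaved schedule, each GPU is assigned several smaller
# virtual pipeline stages instead of one contiguous stage, which reduces
# the pipeline bubble; see docs/source/api-guide/pipeline_parallel.rst.
# Here it is enabled via --num-layers-per-virtual-pipeline-stage below.)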
8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 
8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 
18 | if [ ${SCATTER_GATHER} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${SCATTER_GATHER} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | -------------------------------------------------------------------------------- /examples/bert/README.md: --------------------------------------------------------------------------------
1 | # BERT MODEL
2 | 
3 | ## Table of contents
4 | - [1. Training Setup](#1-training-setup)
5 | - [2. Configurations](#2-configurations)
6 | 
7 | ## 1. Training setup
8 | 
9 | 
10 | To run the model using a Docker container, run it as follows:
11 | ```
12 | PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
13 | CHECKPOINT_PATH="" #<Specify path>
14 | TENSORBOARD_LOGS_PATH="" #<Specify path>
15 | VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
16 | DATA_PATH="" #<Specify path and file prefix>_text_document
17 | 
18 | docker run \
19 |   --gpus=all \
20 |   --ipc=host \
21 |   --workdir /workspace/megatron-lm \
22 |   -v /path/to/data:/path/to/data \
23 |   -v /path/to/megatron-lm:/workspace/megatron-lm \
24 |   $PYTORCH_IMAGE \
25 |   bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
26 | 
27 | ```
28 | NOTE: Depending on the environment you are running in, the above command might look slightly different.
29 | 
30 | 
31 | ## 2. Configurations
32 | 
33 | The example in this folder shows you how to run a 340M-parameter model. There are other configurations you could run as well:
34 | 
35 | ### 4B
36 | ```
37 | --num-layers 48 \
38 | --hidden-size 2560 \
39 | --num-attention-heads 32 \
40 | --tensor-model-parallel-size 1 \
41 | --pipeline-model-parallel-size 1 \
42 | 
43 | ```
44 | 
45 | ### 20B
46 | ```
47 | --num-layers 48 \
48 | --hidden-size 6144 \
49 | --num-attention-heads 96 \
50 | --tensor-model-parallel-size 4 \
51 | --pipeline-model-parallel-size 4 \
52 | 
53 | ``` -------------------------------------------------------------------------------- /examples/gpt3/README.md: --------------------------------------------------------------------------------
1 | # GPT3 MODEL
2 | 
3 | ## Table of contents
4 | - [1. Training Setup](#1-training-setup)
5 | - [2. Configurations](#2-configurations)
6 | - [3. Training Results](#3-training-results)
7 | 
8 | ## 1.
Training setup
9 | 
10 | 
11 | To run the model using a Docker container, run it as follows:
12 | ```
13 | PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
14 | CHECKPOINT_PATH="" #<Specify path>
15 | TENSORBOARD_LOGS_PATH="" #<Specify path>
16 | VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
17 | MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
18 | DATA_PATH="" #<Specify path and file prefix>_text_document
19 | 
20 | docker run \
21 |   --gpus=all \
22 |   --ipc=host \
23 |   --workdir /workspace/megatron-lm \
24 |   -v /path/to/data:/path/to/data \
25 |   -v /path/to/megatron-lm:/workspace/megatron-lm \
26 |   $PYTORCH_IMAGE \
27 |   bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
28 | 
29 | ```
30 | NOTE: Depending on the environment you are running in, the above command might look slightly different.
31 | 
32 | 
33 | ## 2. Configurations
34 | 
35 | The example in this folder shows you how to run a 175B-parameter model. There are other configurations you could run as well:
36 | 
37 | ### 345M
38 | ```
39 | --num-layers 12 \
40 | --hidden-size 512 \
41 | --num-attention-heads 8 \
42 | --seq-length 1024 \
43 | --tensor-model-parallel-size 1 \
44 | --pipeline-model-parallel-size 1 \
45 | 
46 | ```
47 | 
48 | ### 857M
49 | ```
50 | --num-layers 24 \
51 | --hidden-size 1024 \
52 | --num-attention-heads 16 \
53 | --seq-length 2048 \
54 | --tensor-model-parallel-size 1 \
55 | --pipeline-model-parallel-size 1 \
56 | 
57 | ```
58 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This example will start serving the 345M model.
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
4 |                   --nnodes 1 \
5 |                   --node_rank 0 \
6 |                   --master_addr localhost \
7 |                   --master_port 6000"
8 | 
9 | CHECKPOINT=
10 | VOCAB_FILE=
11 | MERGE_FILE=
12 | 
13 | export CUDA_DEVICE_MAX_CONNECTIONS=1
14 | 
15 | pip install flask-restful
16 | 
17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
18 |       --tensor-model-parallel-size 1 \
19 |       --pipeline-model-parallel-size 1 \
20 |       --num-layers 24 \
21 |       --hidden-size 1024 \
22 |       --load ${CHECKPOINT} \
23 |       --num-attention-heads 16 \
24 |       --max-position-embeddings 1024 \
25 |       --tokenizer-type GPT2BPETokenizer \
26 |       --fp16 \
27 |       --micro-batch-size 1 \
28 |       --seq-length 1024 \
29 |       --vocab-file $VOCAB_FILE \
30 |       --merge-file $MERGE_FILE \
31 |       --seed 42
32 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This example will start serving the 345M model that is partitioned 8-way tensor parallel
3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \
4 |                   --nnodes 1 \
5 |                   --node_rank 0 \
6 |                   --master_addr localhost \
7 |                   --master_port 6000"
8 | 
9 | CHECKPOINT=
10 | VOCAB_FILE=
11 | MERGE_FILE=
12 | 
13 | pip install flask-restful
14 | 
15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
16 |        --tensor-model-parallel-size 8 \
17 |        --pipeline-model-parallel-size 1 \
18 |        --num-layers 24 \
19 |        --hidden-size 1024 \
20 |        --load ${CHECKPOINT} \
21 |        --num-attention-heads 16 \
22 |        --max-position-embeddings 1024 \
23 |        --tokenizer-type GPT2BPETokenizer \
24 |        --fp16 \
25 |        --micro-batch-size 1 \
26 |        --seq-length 1024 \
27 |        --vocab-file $VOCAB_FILE \
28 |        --merge-file $MERGE_FILE \
29 |        --seed 42
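# Once the server is up, requests can be sent to it over HTTP. A sketch of a
# request (this assumes the server's default port 5000 and the PUT /api JSON
# interface used by tools/text_generation_cli.py; adjust the URL to your setup):
#   curl 'http://localhost:5000/api' -X 'PUT' \
#        -H 'Content-Type: application/json; charset=UTF-8' \
#        -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'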
30 | -------------------------------------------------------------------------------- /examples/mamba/.gitignore: --------------------------------------------------------------------------------
1 | checkpoints/
2 | data-cache/
3 | tensorboard/
4 | triton-cache/
5 | -------------------------------------------------------------------------------- /examples/mamba/Dockerfile: --------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:24.01-py3
2 | 
3 | RUN pip uninstall -y triton && \
4 |     pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
5 | 
6 | # The causal-conv1d and mamba-ssm packages below are built from scratch here
7 | # (which takes significant time) because there are no wheels available on PyPI
8 | # for these relatively newer versions of the packages that are compatible with
9 | # the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
10 | # are using (in the NGC base container). Generally, if the package is not
11 | # compatible with the PyTorch version, then it will generate a Python import
12 | # error. The package authors tend to release wheels only for new versions of
13 | # these packages that are compatible with the versions of regular PyTorch and
14 | # NGC-variant PyTorch that are newer at the time of release. So, to use newer
15 | # versions of these packages with relatively older versions of the NGC PyTorch
16 | # container, we tend to have to build the packages from scratch.
17 | 
18 | RUN cd /tmp && \
19 |     git clone https://github.com/Dao-AILab/causal-conv1d.git && \
20 |     cd causal-conv1d && \
21 |     git checkout v1.2.2.post1 && \
22 |     CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
23 |     cd .. && \
24 |     rm -rf causal-conv1d
25 | 
26 | RUN cd /tmp && \
27 |     git clone https://github.com/state-spaces/mamba.git && \
28 |     cd mamba && \
29 |     git checkout v2.0.3 && \
30 |     MAMBA_FORCE_BUILD=TRUE pip install . && \
31 |     cd .. && \
32 |     rm -rf mamba
33 | -------------------------------------------------------------------------------- /examples/mamba/run_text_gen_server_8b.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
4 | # To launch the client: python ../../tools/text_generation_cli.py <server-URL>
5 | 
6 | CHECKPOINT_PATH=$1
7 | TOKENIZER_PATH=$2
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | export NCCL_IB_SL=1
16 | export CUDA_DEVICE_MAX_CONNECTIONS=1
17 | export NCCL_IB_TIMEOUT=19
18 | export NCCL_IB_QPS_PER_CONNECTION=4
19 | 
20 | export TRITON_CACHE_DIR="./triton-cache/"
21 | export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
22 | 
23 | torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
24 |        --tensor-model-parallel-size 1 \
25 |        --pipeline-model-parallel-size 1 \
26 |        --untie-embeddings-and-output-weights \
27 |        --num-layers 56 \
28 |        --hidden-size 4096 \
29 |        --load ${CHECKPOINT_PATH} \
30 |        --num-attention-heads 32 \
31 |        --group-query-attention \
32 |        --num-query-groups 8 \
33 |        --hybrid-attention-ratio 0.08 \
34 |        --hybrid-mlp-ratio 0.5 \
35 |        --attention-dropout 0.0 \
36 |        --hidden-dropout 0.0 \
37 |        --disable-bias-linear \
38 |        --normalization RMSNorm \
39 |        --seq-length 4096 \
40 |        --max-position-embeddings 4096 \
41 |        --position-embedding-type none \
42 |        --tokenizer-type GPTSentencePieceTokenizer \
43 |        --tokenizer-model ${TOKENIZER_PATH} \
44 |        --distributed-backend nccl \
45 |        --distributed-timeout-minutes 1440 \
46 |        --bf16 \
47 |        --micro-batch-size 1 \
48 |        --use-mcore-models \
49 |        --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
50 |        --seed 42
51 | -------------------------------------------------------------------------------- /examples/mamba/run_text_gen_server_8b_gpt3.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Use: ./run_text_gen_server_8b_gpt3.sh <checkpoint-path> <tokenizer-path>
4 | # To launch the client: python ../../tools/text_generation_cli.py <server-URL>
5 | 
6 | CHECKPOINT_PATH=$1
7 | TOKENIZER_PATH=$2
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | export NCCL_IB_SL=1
16 | export CUDA_DEVICE_MAX_CONNECTIONS=1
17 | export NCCL_IB_TIMEOUT=19
18 | export NCCL_IB_QPS_PER_CONNECTION=4
19 | 
20 | torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \
21 |        --tensor-model-parallel-size 1 \
22 |        --pipeline-model-parallel-size 1 \
23 |        --use-flash-attn \
24 |        --apply-layernorm-1p \
25 |        --untie-embeddings-and-output-weights \
26 |        --num-layers 32 \
27 |        --hidden-size 4096 \
28 |        --load ${CHECKPOINT_PATH} \
29 |        --num-attention-heads 32 \
30 |        --attention-dropout 0.0 \
31 |        --hidden-dropout 0.0 \
32 |        --disable-bias-linear \
33 |        --seq-length 4096 \
34 |        --max-position-embeddings 4096 \
35 |        --position-embedding-type rope \
36 |        --rotary-percent 0.5 \
37 |        --squared-relu \
38 |        --tokenizer-type GPTSentencePieceTokenizer \
39 |        --tokenizer-model ${TOKENIZER_PATH} \
40 |        --distributed-backend nccl \
41 |        --distributed-timeout-minutes 1440 \
42 |        --bf16 \
43 |        --micro-batch-size 1 \
44 |        --use-mcore-models \
45 |        --transformer-impl local \
46 |        --seed 42
47 | -------------------------------------------------------------------------------- /examples/multimodal/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:24.02-py3
2 | 
3 | RUN apt update && \
4 |     apt -y upgrade && \
5 |     apt install -y --no-install-recommends \
6 |     software-properties-common \
7 |     build-essential \
8 |     python3-pip \
9 |     python3-dev \
10 |     bash \
11 |     git \
12 |     vim \
13 |     python-is-python3 \
14 |     default-jre
15 | 
16 | RUN pip install --upgrade pip
17 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset
18 | RUN pip install transformers datasets
19 | RUN pip install pytest-cov pytest_mock nltk wrapt
20 | RUN pip install zarr "tensorstore==0.1.45"
21 | RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main
22 | RUN pip install black==19.10b0 isort click==8.0.2
23 | RUN pip install pycocoevalcap megatron-energon
24 | RUN pip install git+https://github.com/openai/CLIP.git
25 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies.
26 | RUN pip install mmf --no-deps
27 | RUN pip install open-flamingo[eval] --no-deps
28 | -------------------------------------------------------------------------------- /examples/multimodal/assets/pretrain_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/examples/multimodal/assets/pretrain_curves.png -------------------------------------------------------------------------------- /examples/multimodal/combine_mistral_clip.sh: --------------------------------------------------------------------------------
1 | 
2 | MCORE_MISTRAL=  # directory of the mcore Mistral checkpoint
3 | MCORE_CLIP=  # directory of the mcore CLIP checkpoint
4 | OUTPUT_DIR=  # directory for the combined checkpoint
5 | 
6 | python examples/multimodal/combine_state_dicts.py \
7 |     --input \
8 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_00/model_optim_rng.pt \
9 |     ${MCORE_CLIP}/iter_0000001/mp_rank_00/model_optim_rng.pt \
10 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_01/model_optim_rng.pt \
11 |     ${MCORE_CLIP}/iter_0000001/mp_rank_01/model_optim_rng.pt \
12 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_02/model_optim_rng.pt \
13 |     ${MCORE_CLIP}/iter_0000001/mp_rank_02/model_optim_rng.pt \
14 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_03/model_optim_rng.pt \
15 |     ${MCORE_CLIP}/iter_0000001/mp_rank_03/model_optim_rng.pt \
16 |     --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
17 |     --output \
18 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_00/model_optim_rng.pt \
19 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_01/model_optim_rng.pt \
20 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_02/model_optim_rng.pt \
21 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_03/model_optim_rng.pt -------------------------------------------------------------------------------- /examples/multimodal/convert_llava_pretrain_to_wds.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import webdataset as wds
4 | 
5 | from tqdm import tqdm
6 | 
7 | llava_pretrain_dir = ''  # set to the root of the LLaVA-Pretrain dataset
8 | 
9 | # Paths to the dataset files
10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json')
11 | output = os.path.join(llava_pretrain_dir, 'wds')
12 | 
13 | if not os.path.exists(output):
14 |     os.mkdir(output)
15 | 
16 | # Load data
17 | with open(json_file, 'r') as f:
18 |     data = json.load(f)
19 | 
20 | 
with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: 21 | for entry in tqdm(data): 22 | with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: 23 | image_data = img_file.read() 24 | sample = { 25 | "__key__": entry['id'], 26 | "jpg": image_data, 27 | "json": json.dumps(entry['conversations']).encode("utf-8"), 28 | } 29 | shard_writer.write(sample) 30 | 31 | print(f"Dataset successfully converted to wds") 32 | -------------------------------------------------------------------------------- /examples/multimodal/evaluate_vqav2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | 5 | from open_flamingo.eval.vqa_metric import compute_vqa_accuracy 6 | 7 | 8 | def merge_input_files(input_path): 9 | """Merge input files to a format compatible with the evaluator.""" 10 | output_file_path = input_path + "-VQAv2-merged.json" 11 | 12 | pattern = input_path + "-VQAv2-[0-9].*jsonl" 13 | input_file_paths = glob.glob(pattern) 14 | 15 | results = [] 16 | 17 | for input_file_path in input_file_paths: 18 | with open(input_file_path, "r") as input_file: 19 | for line in input_file: 20 | res = json.loads(line) 21 | res["question_id"] = res["sample_id"] 22 | 23 | results.append(res) 24 | 25 | with open(output_file_path, "w") as output_file: 26 | json.dump(results, output_file) 27 | 28 | return output_file_path 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--input-path', type=str, help="Path to input file(s)") 34 | parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") 35 | parser.add_argument('--question-path', type=str, help="Path to questions file") 36 | args = parser.parse_args() 37 | 38 | result_file = merge_input_files(args.input_path) 39 | 40 | accuracy = compute_vqa_accuracy(result_file, args.question_path, args.groundtruth_path) 41 | print(accuracy) 42 | -------------------------------------------------------------------------------- /examples/multimodal/manual_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "Captioning": { 3 | "raw": [ 4 | "Can you briefly explain what you see in the image?", 5 | "Describe what's happening in this image in one short sentence.", 6 | "Write a short caption that accurately represents the content of this image.", 7 | "Please generate a descriptive caption for the image provided.", 8 | "How would you summarize the scene depicted in the picture in short?" 9 | ] 10 | }, 11 | "OCR": { 12 | "raw": [ 13 | "Can you read the text from image and output here?", 14 | "Extract and document the text from the provided image.", 15 | "Converting the text embedded in this image into a readable document.", 16 | "Transcribe all the text you find.", 17 | "Can you extract all visible text from the image here?" 
18 | ] 19 | }, 20 | "VQA": { 21 | "raw": [ 22 | "Given the image, answer the following question with few words.", 23 | "Answer the following question: ", 24 | "What is the answer to this question?", 25 | "Write the answer: ", 26 | "Please answer this question: " 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/multimodal/pretrain_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/multimodal/sft_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/t5/t5_mcore_train_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/examples/t5/t5_mcore_train_curve.png -------------------------------------------------------------------------------- /images/expt-pp32-flops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/expt-pp32-flops.png -------------------------------------------------------------------------------- /images/expt-pp32-mem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/expt-pp32-mem.png -------------------------------------------------------------------------------- /images/model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/model_table.png -------------------------------------------------------------------------------- /images/schedule-interlaced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-interlaced.png -------------------------------------------------------------------------------- /images/schedule-vocab-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-vocab-1.png -------------------------------------------------------------------------------- /images/schedule-vocab-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-vocab-2.png 
-------------------------------------------------------------------------------- /images/st-passes-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/st-passes-1.png -------------------------------------------------------------------------------- /images/st-passes-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/st-passes-2.png -------------------------------------------------------------------------------- /images/strong_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/strong_scaling.png -------------------------------------------------------------------------------- /images/weak_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/weak_scaling.png -------------------------------------------------------------------------------- /input_store.py: --------------------------------------------------------------------------------
1 | from megatron.core import mpu
2 | from megatron.training import get_args
3 | 
4 | class InputStore:
5 |     """
6 |     For storing and retrieving batch inputs that are only partially consumed.
7 |     """
8 | 
9 |     cache = []
10 | 
11 |     @classmethod
12 |     def save_batch(cls, microbatch_id, data):
13 |         # Grow the cache on demand so microbatch IDs may arrive out of order.
14 |         while len(cls.cache) <= microbatch_id:
15 |             cls.cache.append(None)
16 |         cls.cache[microbatch_id] = data
17 | 
18 |     @classmethod
19 |     def get_batch(cls, microbatch_id):
20 |         contents = cls.cache[microbatch_id]
21 |         # Release the cached batch once the last virtual vocab-parallel chunk
22 |         # that needs it has read it, so the cache stays bounded.
23 |         if mpu.get_virtual_vocab_parallel_chunk() == 3:
24 |             cls.cache[microbatch_id] = None
25 |         elif (
26 |             ((not mpu.is_pipeline_last_stage()) or (get_args().use_interlaced_schedule))
27 |             and (mpu.get_virtual_vocab_parallel_chunk() == 1)
28 |         ):
29 |             cls.cache[microbatch_id] = None
30 |         return contents
31 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: --------------------------------------------------------------------------------
1 | import megatron.core.tensor_parallel
2 | import megatron.core.utils
3 | from megatron.core import parallel_state
4 | from megatron.core.distributed import DistributedDataParallel
5 | from megatron.core.inference_params import InferenceParams
6 | from megatron.core.model_parallel_config import ModelParallelConfig
7 | from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator
8 | from megatron.core.package_info import (
9 |     __contact_emails__,
10 |     __contact_names__,
11 |     __description__,
12 |     __download_url__,
13 |     __homepage__,
14 |     __keywords__,
15 |     __license__,
16 |     __package_name__,
17 |     __repository_url__,
18 |     __shortversion__,
19 |     __version__,
20 | )
21 | from megatron.core.timers import Timers
22 | 
23 | # Alias parallel_state as mpu, its legacy name
24 | mpu = parallel_state
25 | 
26 | __all__ = [
27 |     "parallel_state",
28 |     "tensor_parallel",
29 |     "utils",
30 |     "DistributedDataParallel",
31 |     "InferenceParams",
32 |     "init_num_microbatches_calculator",
33 |     "ModelParallelConfig",
34 |     "Timers",
35 | ]
36 | 
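The `InputStore` above backs the vocabulary-parallel pipeline schedules: each microbatch's input is saved once, read back later by the virtual vocab-parallel chunks, and the slot is freed by the last chunk that needs it. A standalone toy sketch of that caching pattern (the real release condition depends on the `mpu` chunk state and schedule flags, simplified here to an explicit `is_last_consumer` flag):

```python
class ToyInputStore:
    """Toy version of InputStore: cache microbatch inputs, free after last use."""

    cache = []

    @classmethod
    def save_batch(cls, microbatch_id, data):
        # Grow the list on demand so IDs may be saved out of order.
        while len(cls.cache) <= microbatch_id:
            cls.cache.append(None)
        cls.cache[microbatch_id] = data

    @classmethod
    def get_batch(cls, microbatch_id, is_last_consumer):
        contents = cls.cache[microbatch_id]
        if is_last_consumer:
            # Free the slot so peak memory stays proportional to in-flight batches.
            cls.cache[microbatch_id] = None
        return contents


ToyInputStore.save_batch(0, {"tokens": [1, 2, 3]})
first = ToyInputStore.get_batch(0, is_last_consumer=False)  # still cached
last = ToyInputStore.get_batch(0, is_last_consumer=True)    # frees the slot
assert first == last and ToyInputStore.cache[0] is None
```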
-------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .config import RetroGPTChunkDatasets 4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig 5 | from .query.retro_dataset import get_retro_datasets 6 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - Embedder: Base class for all Bert embedders. 7 | - RetroBertEmbedders: Container class for in-memory and on-disk embedders. 8 | - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. 9 | - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. 10 | - RetroTokenizers: Container class for GPT and Bert tokenizers. 11 | """ 12 | 13 | from .bert_embedders import Embedder, RetroBertEmbedders 14 | from .config import RetroPreprocessingConfig 15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets 16 | from .tokenizers import RetroTokenizers 17 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/bert_embedders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for holding both in-memory and on-disk Bert embedders.""" 4 | 5 | import abc 6 | from dataclasses import dataclass 7 | from typing import Any 8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | class Embedder(abc.ABC): 14 | """Base class for all Bert embedders. 15 | 16 | All embedders should be able to embed either an entire text dataset (to a 2D 17 | numpy array), or a single text string (to a 1D numpy array). 18 | """ 19 | 20 | @abc.abstractmethod 21 | def embed_text_dataset(self, text_dataset: torch.utils.data.Dataset) -> np.ndarray: 22 | """Embed a text dataset. 23 | 24 | Args: 25 | text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value. 26 | 27 | Returns: 28 | A 2D ndarray with shape (len(text_dataset), dimension(embedder)). 29 | """ 30 | 31 | @abc.abstractmethod 32 | def embed_text(self, text: str) -> np.ndarray: 33 | """Embed a simple string of text. 
34 | 35 | Args: 36 | text (str): A single text sample. 37 | 38 | Returns: 39 | A 1D ndarray with shape (dimensions(embedder),). 40 | """ 41 | 42 | 43 | @dataclass 44 | class RetroBertEmbedders: 45 | """Container dataclass for in-memory and on-disk Bert embedders.""" 46 | 47 | disk: Embedder 48 | mem: Embedder 49 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/gpt_chunk_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for GPT chunk datasets (train, valid, and test).""" 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class RetroGPTChunkDatasets: 10 | """Container dataclass for GPT chunk datasets.""" 11 | 12 | # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. 13 | train: dict = None 14 | valid: dict = None 15 | test: dict = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container class for GPT and Bert tokenizers.""" 4 | 5 | from dataclasses import dataclass 6 | 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer 8 | 9 | 10 | @dataclass 11 | class RetroTokenizers: 12 | """Container class for GPT and Bert tokenizers.""" 13 | 14 | gpt: MegatronTokenizer = None 15 | bert: MegatronTokenizer = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - build_db: Build a chunk database from a list of indexed datasets. 7 | """ 8 | 9 | from .build import build_db 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Required external libraries for Retro preprocessing.""" 4 | 5 | import importlib 6 | 7 | required_libs = [ 8 | "faiss", 9 | "h5py", 10 | "transformers", # for huggingface bert 11 | ] 12 | 13 | for lib in required_libs: 14 | try: 15 | globals()[lib] = importlib.import_module(lib) 16 | except ImportError as e: 17 | raise Exception( 18 | f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." 19 | ) 20 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - train_index: Train an index on representative vectors. 7 | - add_to_index: Add vectors to a trained index. 8 | - build_index: Wrapper function that calls above two functions. 
9 | """ 10 | 11 | from .build import add_to_index, build_index, train_index 12 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """The IndexFactory constructs an index from an index type string.""" 4 | 5 | from megatron.core.datasets.retro.index.index import Index 6 | 7 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 8 | 9 | 10 | class IndexFactory: 11 | """Get index. 12 | 13 | Index type generally read from argument '--retro-index-ty'. 14 | """ 15 | 16 | @classmethod 17 | def get_index_class(cls, index_type: str) -> type: 18 | """Get an index class, given a type string. 19 | 20 | Args: 21 | index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). 22 | 23 | Returns: 24 | An `Index` sub-type corresponding to the `index_type`. 25 | """ 26 | return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex,}[index_type] 27 | 28 | @classmethod 29 | def get_index(cls, index_type: str) -> Index: 30 | """Construct an index from an index type string. 31 | 32 | Args: 33 | index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). 34 | 35 | Returns: 36 | An `Index` instance corresponding to the `index_type`. 37 | """ 38 | index_class = cls.get_index_class(index_type) 39 | index = index_class() 40 | return index 41 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index. 7 | """ 8 | 9 | from .faiss_base import FaissBaseIndex 10 | from .faiss_par_add import FaissParallelAddIndex 11 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for querying the pretraining dataset.""" 4 | 5 | import os 6 | 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset 8 | 9 | 10 | def get_query_dir(project_dir: str) -> str: 11 | """Get root directory of all saved query data. 12 | 13 | Args: 14 | project_dir (str): Retro project dir. 15 | 16 | Returns: 17 | Path to query sub-directory in Retro project. 18 | """ 19 | return os.path.join(project_dir, "query") 20 | 21 | 22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: 23 | """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 24 | 25 | Args: 26 | project_dir (str): Retro project dir. 27 | key (str): Dataset split key; 'train', 'valid', or 'test'. 
28 | dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. 29 | 30 | Returns: 31 | Path to directory containing this dataset's neighbors within Retro project. 32 | """ 33 | return os.path.join( 34 | get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}"), 35 | ) 36 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | save, 11 | ) 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | from .common import _import_trigger 6 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .distributed_data_parallel import DistributedDataParallel 4 | from .distributed_data_parallel_config import DistributedDataParallelConfig 5 | from .finalize_model_grads import finalize_model_grads 6 | from .param_and_grad_buffer import ParamAndGradBuffer, shard_buffer 7 | -------------------------------------------------------------------------------- /megatron/core/distributed/distributed_data_parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class DistributedDataParallelConfig: 9 | """Configuration for DistributedDataParallel.""" 10 | 11 | grad_reduce_in_fp32: bool = False 12 | """If true, reduce grads in fp32.""" 13 | 14 | overlap_grad_reduce: bool = False 15 | """If true, overlap grad all-reduce / reduce-scatter with backward compute.""" 16 | 17 | use_distributed_optimizer: bool = False 18 | """If true, issue reduce-scatter collectives to aggregate gradients and clean up 19 | originally allocated model parameters, otherwise issue all-reduce collectives. 20 | """ 21 | 22 | check_for_nan_in_grad: bool = False 23 | """ If true, check for NaNs in gradients _before_ communication collective.""" 24 | 25 | bucket_size: Optional[int] = None 26 | """Maximum number of parameters in each bucket. 
If unspecified, MCore uses a default 27 | value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger 28 | buckets to ensure collectives do not become latency-bound).""" 29 | 30 | average_in_collective: bool = False 31 | """If true, compute average in collective directly, as opposed to dividing by the 32 | dp_size first and then computing sum in the collective.""" 33 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | 16 | @jit_fuser 17 | def bias_gelu(bias, y): 18 | x = bias + y 19 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 20 | 21 | 22 | # gradient of tanh approximation of gelu 23 | # gradient of actual gelu is: 24 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 25 | @jit_fuser 26 | def bias_gelu_back(g, bias, y): 27 | x = bias + y 28 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 29 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 30 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 31 | 1 + tanh_out 32 | ) 33 | return ff * g 34 | 35 | 36 | class GeLUFunction(torch.autograd.Function): 37 | @staticmethod 38 | # bias is an optional argument 39 | def forward(ctx, input, bias): 40 | ctx.save_for_backward(input, bias) 41 | return bias_gelu(bias, input) 42 | 43 | @staticmethod 44 | def backward(ctx, grad_output): 45 | input, bias = ctx.saved_tensors 46 | tmp = bias_gelu_back(grad_output, bias, input) 47 | return tmp, tmp 48 | 49 | 50 | bias_gelu_impl = GeLUFunction.apply 51 | -------------------------------------------------------------------------------- /megatron/core/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/ammo_support/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/gpt/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/inference/common_inference_params.py: --------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | 
3 | 
4 | @dataclass
5 | class CommonInferenceParams:
6 |     """Inference parameters sent along with the prompts
7 | 
8 |     For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910
9 |     """
10 | 
11 |     temperature: float = 1.0
12 |     top_k: int = 0
13 |     top_p: float = 0.0
14 |     return_log_probs: bool = False
15 |     num_tokens_to_generate: int = 30
16 | 
17 |     def add_attributes(self, attribute_value_pair: dict):
18 |         """Utility to add more attributes to inference params
19 | 
20 |         Use this method to pass in a custom dictionary to add more inference parameter attributes to the instance you created. Use as follows:
21 |         c = CommonInferenceParams()
22 |         c.add_attributes({'min_length':4, 'eod_id':153})
23 | 
24 |         Args:
25 |             attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values.
26 |         """
27 |         for key, value in attribute_value_pair.items():
28 |             setattr(self, key, value)
29 | -------------------------------------------------------------------------------- /megatron/core/inference/communication_utils.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | from megatron.core import parallel_state
4 | 
5 | 
6 | def _is_cuda(tensor):
7 |     """Check that a tensor is not None and is on a CUDA device."""
8 |     assert tensor is not None
9 |     assert tensor.is_cuda
10 | 
11 | 
12 | def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
13 |     """Broadcast a tensor from the last pipeline stage to all ranks."""
14 | 
15 |     if parallel_state.is_pipeline_last_stage():
16 |         _is_cuda(tensor)
17 |         assert tensor.is_contiguous()
18 |     else:
19 |         tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device())
20 |     # Get the group and corresponding source rank.
21 |     src = parallel_state.get_pipeline_model_parallel_last_rank()
22 |     group = parallel_state.get_pipeline_model_parallel_group()
23 |     torch.distributed.broadcast(tensor, src, group)
24 |     return tensor
25 | 
26 | 
27 | def recv_from_prev_pipeline_rank_(recv_buffer=None):
28 |     """Receive from previous pipeline stage and update the
29 |     input buffer inplace."""
30 |     recv_prev_op = torch.distributed.P2POp(
31 |         torch.distributed.irecv, recv_buffer, parallel_state.get_pipeline_model_parallel_prev_rank()
32 |     )
33 |     reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
34 |     for req in reqs:
35 |         req.wait()
36 |     # To protect against race condition when using batch_isend_irecv().
37 | torch.cuda.synchronize() 38 | 39 | 40 | def send_to_next_pipeline_rank(tensor=None): 41 | """Send output to the next pipeline stage.""" 42 | send_next_op = torch.distributed.P2POp( 43 | torch.distributed.isend, tensor, parallel_state.get_pipeline_model_parallel_next_rank() 44 | ) 45 | reqs = torch.distributed.batch_isend_irecv([send_next_op]) 46 | for req in reqs: 47 | req.wait() 48 | # To protect against race condition when using batch_isend_irecv(). 49 | torch.cuda.synchronize() 50 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/engines/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/engines/abstract_engine.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | 5 | class AbstractEngine(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def generate(self) -> dict: 9 | """The abstract backend's generate function. 10 | 11 | To define a new backend, implement this and return the outputs as a dictionary. 12 | 13 | Returns: 14 | dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`. 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /megatron/core/inference/inference_request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from typing import List 4 | 5 | import torch 6 | 7 | from megatron.core.inference.common_inference_params import CommonInferenceParams 8 | 9 | 10 | # class syntax 11 | class Status(Enum): 12 | WAITING_IN_QUEUE = 1 13 | ACTIVE_AND_GENERATING_TOKENS = 2 14 | ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 15 | COMPLETED = 4 16 | 17 | 18 | @dataclass 19 | class InferenceRequest: 20 | request_id: str 21 | prompt: str 22 | inference_parameters: CommonInferenceParams 23 | prompt_tokens: List[int] 24 | arrival_time: float 25 | status: Status 26 | generated_text: str = None 27 | generated_tokens: torch.Tensor = None 28 | generated_log_probs: torch.Tensor = None 29 | generated_length: int = 0 30 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/model_inference_wrappers/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/gpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/model_inference_wrappers/gpt/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass
2 | 
3 | import torch
4 | 
5 | 
6 | @dataclass
7 | class InferenceWrapperConfig:
8 |     """Config for the model inference wrapper
9 | 
10 |     NOTE: All the arguments here are obtained from the arguments.py file
11 |     """
12 | 
13 |     hidden_size: int
14 |     """Receives between pipeline stages during PP use tensors of shape [seq_len, batch_size, hidden_size]"""
15 | 
16 |     params_dtype: torch.dtype
17 |     """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used"""
18 | 
19 |     inference_batch_times_seqlen_threshold: int
20 |     """If batch size times sequence length is smaller than this threshold then we will not use pipelining, otherwise we will."""
21 | 
22 |     padded_vocab_size: int
23 |     """The final padded vocab size (padded to make it divisible by the --make-vocab-size-divisible-by value)"""
24 | 
25 |     fp32_residual_connection: bool = False
26 |     """Move residual connections to fp32. Obtained from arguments.py"""
27 | 
28 |     def add_attributes(self, attribute_value_pair: dict):
29 |         """Utility to add more attributes to the inference wrapper config
30 | 
31 |         Use this method to pass in a custom dictionary to add more config attributes to the instance you created. Use as follows:
32 |         c = InferenceWrapperConfig()
33 |         c.add_attributes({'precision':'fp32'})
34 | 
35 |         Args:
36 |             attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values.
37 |         """
38 |         for key, value in attribute_value_pair.items():
39 |             setattr(self, key, value)
40 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/text_generation_controllers/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/utils.py: --------------------------------------------------------------------------------
1 | class Counter:
2 |     """A simple counter class
3 | 
4 |     This class is responsible for assigning request ids to incoming requests
5 |     """
6 | 
7 |     def __init__(self, start: int = 0) -> None:
8 |         self.counter = start
9 | 
10 |     def __next__(self) -> int:
11 |         i = self.counter
12 |         self.counter += 1
13 |         return i
14 | 
15 |     def reset(self) -> None:
16 |         self.counter = 0
17 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: --------------------------------------------------------------------------------
1 | class InferenceParams:
2 |     """Inference parameters that are passed to the main model in order
3 |     to efficiently calculate and store the context during inference."""
4 | 
5 |     def __init__(self, max_batch_size, max_sequence_length):
6 |         self.max_sequence_length = max_sequence_length
7 |         self.max_batch_size = max_batch_size
8 |         self.sequence_len_offset = 0
9 |         self.batch_size_offset = 0
10 |         self.key_value_memory_dict = {}
11 | 
12 |     def swap_key_value_dict(self, batch_idx):
13 |         """Swap between batches."""
14 |         if len(self.key_value_memory_dict) == 0:
15 |             raise ValueError("should not swap when dict is empty")
16 | 
17 |         for layer_number in self.key_value_memory_dict.keys():
18 |             inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
19 |             assert (
20 |                 len(batch_idx) == inference_key_memory.shape[1]
21 |             )  # make sure batch size is the same
22 | 
new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | 29 | def __str__(self): 30 | return f"InferenceParams(max_seq_len = {self.max_sequence_length}, max_batch_size = {self.max_batch_size}, sequence_len_offset = {self.sequence_len_offset}, batch_size_offset = {self.batch_size_offset}, key_value_memory_dict = {self.key_value_memory_dict.keys()})" 31 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 6 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 7 | 8 | jit_fuser = torch.jit.script 9 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 10 | if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2): 11 | jit_fuser = torch.compile 12 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | from .t5_model import T5Model 2 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_lm_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | from megatron.core.transformer.utils import get_linear_layer 7 | 8 | try: 9 | import apex 10 | 11 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 12 | 13 | HAVE_APEX = True 14 | LNImpl = FusedLayerNorm 15 | except ImportError: 16 | import warnings 17 | 18 | from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm 19 | 20 | warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') 21 | LNImpl = WrappedTorchLayerNorm 22 | 23 | 24 | class BertLMHead(MegatronModule): 25 | """Masked LM head for Bert. 26 | 27 | Args: 28 | hidden_size: hidden size 29 | config (TransformerConfig): TransformerConfig object 30 | """ 31 | 32 | def __init__( 33 | self, 34 | hidden_size: int, 35 | config: TransformerConfig, 36 | ): 37 | super().__init__(config=config) 38 | 39 | # TODO: Should switch this to TE ? 
40 | self.dense = get_linear_layer( 41 | hidden_size, hidden_size, config.init_method, config.perform_initialization 42 | ) 43 | 44 | setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) 45 | setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) 46 | 47 | self.layer_norm = LNImpl( 48 | config=config, 49 | hidden_size=hidden_size, 50 | eps=config.layernorm_epsilon, 51 | ) 52 | 53 | self.gelu = torch.nn.functional.gelu 54 | 55 | def forward(self, hidden_states: Tensor) -> Tensor: 56 | hidden_states = self.dense(hidden_states) 57 | hidden_states = self.gelu(hidden_states) 58 | hidden_states = self.layer_norm(hidden_states) 59 | return hidden_states 60 | -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/embeddings/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 
11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | from .mamba_model import MambaModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/multimodal/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - RetroConfig: configuration dataclass for RetroModel. 7 | - RetroModel: The Retro model. 8 | - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 9 | """ 10 | 11 | from .config import RetroConfig 12 | from .decoder_spec import get_retro_decoder_block_spec 13 | from .model import RetroModel 14 | -------------------------------------------------------------------------------- /megatron/core/models/retro/base_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Base class for decoder and encoder attention modules.""" 4 | 5 | from megatron.core.models.retro.config import RetroConfig 6 | from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules 7 | from megatron.core.transformer.enums import AttnMaskType 8 | from megatron.core.transformer.module import MegatronModule 9 | 10 | 11 | class BaseRetroCrossAttention(MegatronModule): 12 | 13 | """Base class for Retro cross attention, for both encoder & decoder layers. 14 | 15 | This class collects the retro arguments below (i.e., num neighbors, chunk 16 | length, and retrieve length) for use in Retro's custom cross attention 17 | operators. 18 | 19 | Args: 20 | config (RetroConfig): Retro config. 21 | submodules (CrossAttentionSubmodules): Cross attention submodules. 22 | layer_number (int): Layer number within transformer block. 23 | attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). 
24 | """ 25 | 26 | def __init__( 27 | self, 28 | config: RetroConfig, 29 | submodules: CrossAttentionSubmodules, 30 | layer_number: int = 1, 31 | attn_mask_type: AttnMaskType = AttnMaskType.padding, 32 | ): 33 | super().__init__(config=config) 34 | 35 | self.attn = CrossAttention( 36 | config=config, 37 | submodules=submodules, 38 | layer_number=layer_number, 39 | attn_mask_type=attn_mask_type, 40 | ) 41 | 42 | self.retro_num_neighbors = config.retro_num_neighbors 43 | self.retro_chunk_length = config.retro_chunk_length 44 | self.retro_retrieved_length = config.retro_retrieved_length 45 | -------------------------------------------------------------------------------- /megatron/core/models/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | import torch 6 | 7 | 8 | def get_config_path(project_dir: str) -> str: 9 | """Config copy stored within retro project dir.""" 10 | return os.path.join(project_dir, "config.json") 11 | 12 | 13 | def get_gpt_data_dir(project_dir: str) -> str: 14 | """Get project-relative directory of GPT bin/idx datasets.""" 15 | return os.path.join(project_dir, "data") 16 | 17 | 18 | # ** Note ** : Retro's compatibility between cross attention and Flash/Fused 19 | # Attention is currently a work in progress. We default to returning None for 20 | # now. 21 | # def get_all_true_mask(size, device): 22 | # return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) 23 | def get_all_true_mask(size, device): 24 | return None 25 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | MAJOR = 0 5 | MINOR = 9 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from torch import Tensor 4 | 5 | 6 | @dataclass 7 | class PackedSeqParams: 8 | # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, 9 | qkv_format: str = None 10 | cu_seqlens_q: Tensor = None 11 | cu_seqlens_kv: Tensor = None 12 | max_seqlen_q: Tensor = None 13 | max_seqlen_kv: Tensor = None 14 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/ssm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/ssm/__init__.py -------------------------------------------------------------------------------- /megatron/core/ssm/triton_cache_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import socket 5 | from pathlib import Path 6 | 7 | import torch 8 | 9 | try: 10 | from triton.runtime.cache import FileCacheManager 11 | except ImportError: 12 | raise ImportError("triton is required by the Mamba model but cannot be imported") 13 | 14 | 15 | def get_rank(): 16 | return torch.distributed.get_rank() 17 | 18 | 19 | def default_cache_dir(): 20 | return os.path.join(Path.home(), ".triton", "cache") 21 | 22 | 23 | class ParallelFileCacheManager(FileCacheManager): 24 | 25 | # See https://github.com/triton-lang/triton/blob/main/python/triton/runtime/cache.py 26 | 27 | # When running Triton with multiple ranks, they each create their own cache manager. 
Their input 28 | # keys to that class are mostly (but not entirely) the same across ranks, which leads many ranks 29 | # to write to the same 'key' directories in the cache dir at the same time during compilation, 30 | # leading to conflicts. This class works around that by making each cache dir rank-specific, 31 | # inserting a "rank_<hostname>_<pid>" component into the cache directory path. 32 | 33 | def __init__(self, key): 34 | self.key = key 35 | self.lock_path = None 36 | # create cache directory if it doesn't exist 37 | self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) 38 | self.cache_dir = os.path.join( 39 | self.cache_dir, "rank_{}_{}".format(socket.gethostname(), os.getpid()) 40 | ) 41 | if self.cache_dir: 42 | self.cache_dir = os.path.join(self.cache_dir, self.key) 43 | self.lock_path = os.path.join(self.cache_dir, "lock") 44 | os.makedirs(self.cache_dir, exist_ok=True) 45 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/vocab_input_store.py: -------------------------------------------------------------------------------- 1 | 2 | class VocabInputStore: 3 | """ 4 | For storing and retrieving intermediate results of the VocabParallelInput layer. 5 | 6 | Both caches are class-level FIFO queues, so entries are retrieved in the order they were stored. 7 | """ 8 | 9 | forward_cache = [] 10 | backward_cache = [] 11 | 12 | @classmethod 13 | def forward_store(cls, output_tensor, handle): 14 | """Store a forward output tensor together with its (possibly asynchronous) communication handle.""" 15 | cls.forward_cache.append((output_tensor, handle)) 16 | 17 | @classmethod 18 | def forward_get(cls, remove=True): 19 | """Return the oldest cached tensor, waiting on its handle first; pop it unless remove=False.""" 20 | output_tensor, handle = cls.forward_cache[0] 21 | if handle is not None: 22 | handle.wait() 23 | if remove: 24 | cls.forward_cache.pop(0) 25 | else: 26 | cls.forward_cache[0] = (output_tensor, None) 27 | return output_tensor 28 | 29 | @classmethod 30 | def backward_store(cls, grad_output): 31 | """Store a gradient from the backward pass for later retrieval.""" 32 | cls.backward_cache.append(grad_output) 33 | 34 | @classmethod 35 | def backward_get(cls): 36 | """Pop and return the oldest cached gradient.""" 37 | contents = cls.backward_cache[0] 38 | cls.backward_cache.pop(0) 39 | return contents 40 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/vocab_output_store.py: -------------------------------------------------------------------------------- 1 | 2 | class VocabOutputStore: 3 | """ 4 | For storing and retrieving intermediate results of the VocabParallelOutput layer.
5 | """ 6 | 7 | # Set by the pipeline schedule to select which microbatch slot to read or write. 8 | microbatch_id = 0 9 | forward_cache = [] 10 | backward_cache = [] 11 | 12 | @classmethod 13 | def forward_store(cls, sum_exp_logits, logits_max, predicted_logits, target_mask, 14 | softmax_grad_input, ground_truth_grad_input): 15 | """Store the partial softmax statistics and gradient inputs for the current microbatch.""" 16 | while len(cls.forward_cache) <= cls.microbatch_id: 17 | cls.forward_cache.append(None) 18 | cls.forward_cache[cls.microbatch_id] = ( 19 | sum_exp_logits, logits_max, predicted_logits, target_mask, softmax_grad_input, ground_truth_grad_input 20 | ) 21 | 22 | @classmethod 23 | def forward_get(cls): 24 | """Return and clear the cached forward entry for the current microbatch.""" 25 | contents = cls.forward_cache[cls.microbatch_id] 26 | cls.forward_cache[cls.microbatch_id] = None 27 | return contents 28 | 29 | @classmethod 30 | def backward_store(cls, sum_exp_logits, logits_max, grad_output): 31 | """Store the softmax statistics and incoming gradient for the current microbatch.""" 32 | while len(cls.backward_cache) <= cls.microbatch_id: 33 | cls.backward_cache.append(None) 34 | cls.backward_cache[cls.microbatch_id] = ( 35 | sum_exp_logits, logits_max, grad_output 36 | ) 37 | 38 | @classmethod 39 | def backward_get(cls): 40 | """Return and clear the cached backward entry for the current microbatch.""" 41 | contents = cls.backward_cache[cls.microbatch_id] 42 | cls.backward_cache[cls.microbatch_id] = None 43 | return contents 44 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | no_mask = 3 # only used for TE 27 | padding_causal = 4 # only used for thd attention 28 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
20 | Such a func is handy for ops like `bias_dropout_fusion`, which themselves 21 | return a function at runtime based on the passed arguments. 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | return grouped_gemm is not None 11 | 12 | 13 | def assert_grouped_gemm_is_available(): 14 | assert grouped_gemm_is_available(), ( 15 | "Grouped GEMM is not available. Please run " 16 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`." 17 | ) 18 | 19 | 20 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 21 | -------------------------------------------------------------------------------- /megatron/core/transformer/torch_layer_norm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from megatron.core.transformer import TransformerConfig 6 | 7 | 8 | class WrappedTorchLayerNorm(torch.nn.LayerNorm): 9 | 10 | def __init__( 11 | self, 12 | config: TransformerConfig, 13 | hidden_size: int, 14 | eps: float = 1e-5, 15 | persist_layer_norm: bool = False, ## TODO: unused arguments. See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/issues/223 16 | zero_centered_gamma: bool = False, 17 | normalization: str = "LayerNorm", # included to match TE interface 18 | ): 19 | self.config = config 20 | assert ( 21 | not self.config.layernorm_zero_centered_gamma 22 | ), "zero_centered_gamma not supported by torch LayerNorm" 23 | 24 | assert ( 25 | self.config.normalization == "LayerNorm" 26 | ), f'({self.config.normalization}) is not supported by torch LayerNorm' 27 | 28 | assert ( 29 | not self.config.persist_layer_norm 30 | ), "persist_layer_norm not supported by torch LayerNorm" 31 | 32 | assert ( 33 | not self.config.sequence_parallel 34 | ), "sequence parallel not supported by torch LayerNorm" 35 | 36 | assert ( 37 | not self.config.memory_efficient_layer_norm 38 | ), "memory_efficient_layer_norm not supported by torch LayerNorm" 39 | 40 | super().__init__( 41 | normalized_shape=hidden_size, ## applied to last len(normalized_shape.size) dimensions 42 | eps=eps, 43 | ) 44 | -------------------------------------------------------------------------------- /megatron/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 3 | 4 | def add_modelopt_args(parser): 5 | """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.""" 6 | group = parser.add_argument_group(title="modelopt-generic") 7 | 8 | group.add_argument( 9 | "--export-legacy-megatron", 10 | action="store_true", 11 | help="Export a legacy megatron-lm checkpoint.", 12 | ) 13 | group.add_argument( 14 | "--export-te-mcore-model", 15 | action="store_true", 16 | help="Export a megatron-core transformer-engine checkpoint.", 17 | ) 18 | group.add_argument( 19 | "--export-quant-cfg", 20 | type=str, 21 | default=None, 22 | choices=["int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "None"], 23 | help="Specify a quantization config from the supported choices.", 24 | ) 25 | 26 | return parser 27 | -------------------------------------------------------------------------------- /megatron/inference/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/legacy/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/data/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need these class definitions to deserialize old checkpoints.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes.
*/ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/legacy/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/legacy/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from megatron.core.jit import jit_fuser 5 | 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | @jit_fuser 16 | def bias_gelu(bias, y): 17 | x = bias + y 18 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @jit_fuser 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 29 | return ff * g 30 | 31 | class GeLUFunction(torch.autograd.Function): 32 | @staticmethod 33 | # bias is an optional argument 34 | def forward(ctx, input, bias): 35 | ctx.save_for_backward(input, bias) 36 | return bias_gelu(bias, input) 37 | 38 | @staticmethod 39 | def backward(ctx, grad_output): 40 | input, bias = ctx.saved_tensors 41 | tmp = bias_gelu_back(grad_output, bias, input) 42 | return tmp, tmp 43 | 44 | bias_gelu_impl = GeLUFunction.apply 45 | -------------------------------------------------------------------------------- /megatron/legacy/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False): 12 | """RMS Normalization module 13 | 14 | Args: 15 | dim (int): The width of input, i.e. hidden size 16 | eps (float): epsilon to use for the norm, defaults to 1e-6 17 | sequence_parallel (bool): Set to true if sequence parallelism is being used; 18 | this marks the weights as needing to be all-reduced. 19 | """ 20 | super().__init__() 21 | self.eps = eps 22 | self.weight = nn.Parameter(torch.ones(dim)) 23 | 24 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 25 | 26 | def _norm(self, x): 27 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 28 | 29 | def forward(self, x): 30 | output = self._norm(x.float()).type_as(x) 31 | return output * self.weight 32 | -------------------------------------------------------------------------------- /megatron/legacy/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would be more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/legacy/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/mpu/tests/__init__.py
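Editor's note on `fused_bias_gelu.py` above: `bias_gelu_back` hand-codes the derivative of the tanh-approximated GeLU instead of relying on autograd, and `GeLUFunction.backward` returns the same tensor twice because the gradient with respect to `bias` equals the gradient with respect to `input` for a function of `bias + input`. Below is a minimal editorial sanity check, not part of the repo; it re-declares the two functions without the `@jit_fuser` decorator so it runs standalone:

```python
import torch

def bias_gelu(bias, y):
    # Tanh approximation of GeLU, as in fused_bias_gelu.py.
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

def bias_gelu_back(g, bias, y):
    # Hand-written gradient of the approximation above.
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g

bias = torch.randn(8, dtype=torch.float64, requires_grad=True)
y = torch.randn(8, dtype=torch.float64, requires_grad=True)

out = bias_gelu(bias, y)
out.backward(torch.ones_like(out))

# The analytic gradient should agree with autograd for both inputs,
# since d/d(bias) == d/d(y) for f(bias + y).
manual = bias_gelu_back(torch.ones_like(out), bias.detach(), y.detach())
assert torch.allclose(bias.grad, manual, atol=1e-10)
assert torch.allclose(y.grad, manual, atol=1e-10)
print("bias_gelu_back matches autograd")
```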
-------------------------------------------------------------------------------- /megatron/training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args 6 | from .global_vars import get_signal_handler 7 | from .global_vars import get_tokenizer 8 | from .global_vars import get_tensorboard_writer 9 | from .global_vars import get_wandb_writer 10 | from .global_vars import get_one_logger 11 | from .global_vars import get_adlr_autoresume 12 | from .global_vars import get_timers 13 | from .initialize import initialize_megatron 14 | from .training import pretrain, get_model, get_train_valid_test_num_samples 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/training/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | try: 6 | jit_fuser = torch.compile 7 | except: 8 | jit_fuser = torch.jit.script 9 | 10 | 11 | @jit_fuser 12 | def squared_relu(x: torch.Tensor) -> torch.Tensor: 13 | return torch.pow(F.relu(x), 2) 14 | 15 | 16 | @jit_fuser 17 | def quick_gelu(x: torch.Tensor) -> torch.Tensor: 18 | return x * torch.sigmoid(1.702 * x) 19 | -------------------------------------------------------------------------------- /megatron/training/async_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | This module provides a singleton instance of AsyncCallsQueue which manages 5 | the async checkpoint save calls. 6 | """ 7 | import logging 8 | 9 | from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue, AsyncRequest 10 | from megatron.training import get_args 11 | from megatron.training.utils import print_rank_0 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | # Singleton manager of async calls 16 | _async_calls_queue = AsyncCallsQueue() 17 | 18 | 19 | def schedule_async_save(async_request: AsyncRequest): 20 | """ Schedule the async save request. 21 | 22 | Args: 23 | async_request (AsyncRequest): the async save request. 24 | """ 25 | _async_calls_queue.schedule_async_request(async_request) 26 | 27 | 28 | def maybe_finalize_async_save(blocking: bool = False): 29 | """ Finalizes active async save calls. 30 | 31 | Args: 32 | blocking (bool, optional): if True, will wait until all active requests 33 | are done. Otherwise, finalizes only the async request that already 34 | finished. Defaults to False. 35 | """ 36 | args = get_args() 37 | if not args.async_save: 38 | return 39 | 40 | if blocking and _async_calls_queue.get_num_unfinalized_calls() > 0: 41 | print_rank_0('Unfinalized async checkpoint saves. Finalizing them synchronously now.') 42 | 43 | _async_calls_queue.maybe_finalize_async_calls(blocking) 44 | -------------------------------------------------------------------------------- /megatron/training/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries from the blacklisted modules 20 | # (e.g., PyTorch Distributed) from passing through. 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/training/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | [build-system] 4 | requires = [ 5 | "setuptools", 6 | "pybind11", 7 | ] 8 | 9 | [tool.isort] 10 | profile = "black" # black-compatible 11 | line_length = 100 # should match black parameters 12 | py_version = 38 # python 3.8 as a target version 13 | known_first_party = ["megatron"] # FIRSTPARTY section 14 | known_third_party = ["transformer_engine"] # THIRDPARTY section 15 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 16 | default_section = "THIRDPARTY" 17 | extend_skip = ["setup.py"] 18 | 19 | [tool.black] 20 | line_length = 100 21 | skip_string_normalization = true 22 | # recognized by future versions; disallows reformatting code with an incompatible version 23 | # Matches the NeMo version so people working on both codebases don't need two different versions of black installed 24 | required_version = "24" 25 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. 
We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron.training import get_args 6 | from megatron.training import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron.training import get_args, print_rank_0 6 | from megatron.legacy.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml: -------------------------------------------------------------------------------- 1 | type: basic 2 | format_version: 1 3 | maintainers: [maanug] 4 | loggers: [stdout] 5 | launchers: 6 | type:slurm: 7 | ntasks_per_node: '{gpus}' 8 | no_container_mount_home: 'true' 9 | spec: 10 | name: "{model}_{variant}_{scope}_\ 11 | mbs{mbs}_gbs{gbs}_\ 12 | {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ 13 | tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ 14 | {'_'+args_meta if args_meta else ''} 15 | _{platforms}_{nodes}N{gpus}G" 16 | model: gpt3-nemo 17 | variant: 126m 18 | build: mcore-nemo 19 | scope: mr 20 | nodes: 1 21 | gpus: 8 22 | platforms: dgx_a100 23 | steps: 50 24 | extra_args: null 25 | args_meta: null 26 | precision: bf16 27 | time_limit: 1200 28 | use_mcore: True 29 | use_te: True 30 | vp_size: null 31 | script: |- 32 | cd /opt/NeMo 33 | 34 | /opt/megatron-lm/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh \ 35 | TP_SIZE={tp_size} \ 36 | PP_SIZE={pp_size} \ 37 | NUM_NODES={nodes} \ 38 | MAX_STEPS={steps} \ 39 | VP_SIZE={vp_size if vp_size is not None else '""'} \ 40 | MBS={mbs} \ 41 | GBS={gbs} \ 42 | JOB_NAME={name} \ 43 | ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} 44 | products: 45 | - {tp_size: [1], pp_size: [1], mbs: [4], gbs: [64], vp_size: [null]} 46 | - {tp_size: [2], pp_size: [4], mbs: [1], gbs: [8], vp_size: [3], extra_args: ['"model.sequence_parallel=True model.overlap_p2p_comm=True model.batch_p2p_comm=False"'], args_meta: ["seq_par_overlap_p2p"]} 47 | -------------------------------------------------------------------------------- /tests/functional_tests/jet_recipes/build-pyt.yaml: -------------------------------------------------------------------------------- 1 | type: build 2 | 
format_version: 1 3 | maintainers: [maanug] 4 | spec: 5 | name: mcore-pyt 6 | platforms: [linux/amd64] 7 | source: 8 | # The image tag will be added via `jet-tests.yaml` 9 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 10 | image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci 11 | 12 | 13 | --- 14 | type: build 15 | format_version: 1 16 | maintainers: [maanug] 17 | spec: 18 | name: mcore-nemo 19 | platforms: [linux/amd64] 20 | source: 21 | # The image tag will be added via `jet-tests.yaml` 22 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 23 | image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 4 | import json 5 | import sys 6 | 7 | from tests.functional_tests.python_test_utils.common import read_tb_logs_as_list 8 | 9 | 10 | def collect_train_test_metrics(logs_dir, run_name): 11 | summaries = read_tb_logs_as_list(logs_dir) 12 | 13 | train_metrics = { 14 | metric_name: { 15 | "start_step": 0, 16 | "end_step": len(metric_values), 17 | "step_interval": 5, 18 | "values": metric_values[0 : len(metric_values) : 5], 19 | } 20 | for metric_name, metric_values in summaries.items() 21 | } 22 | print( 23 | f"\n ----------- Store the following metrics in tests/functional_tests/test_results/jet/{run_name}.json ----------" 24 | ) 25 | print(f"\n {json.dumps(train_metrics)}", flush=True) 26 | 27 | 28 | if __name__ == "__main__": 29 | args = sys.argv[1:] 30 | logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/ 31 | run_name = args[1] 32 | collect_train_test_metrics(logs_dir, run_name) 33 | -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/multitest_ci_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pytest 4 | import sys 5 | import glob 6 | from .common import read_tb_logs_as_list, TypeOfTest 7 | from .test_ci_pipeline import TestCIPipeline 8 | 9 | LOGS_DIR = os.getenv('LOGS_DIR') 10 | EXPECTED_METRICS_DIR = os.getenv('EXPECTED_METRICS_DIR') 11 | 12 | 13 | class TestBulkCIPipeline(TestCIPipeline): 14 | 15 | margin_loss, margin_time = 0.05, 0.1 16 | 17 | def _setup(self, config_name): 18 | self.config_name = config_name 19 | baseline_filename = config_name + '.json' 20 | 21 | filepath = os.path.join(EXPECTED_METRICS_DIR, baseline_filename) 22 | if os.path.exists(filepath): 23 | with open(filepath) as f: 24 | self.expected = json.load(f) 25 | else: 26 | raise FileNotFoundError(f"{baseline_filename} does not exist") 27 | 28 | def _get_actual(self, loss_type): 29 | return read_tb_logs_as_list(LOGS_DIR+'/'+self.config_name, loss_type) 30 | 31 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 32 | def test_lm_loss_deterministic(self, config_name): 33 | # Expected training loss curve at different global steps. 
34 | self._setup(config_name) 35 | self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) 36 | 37 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 38 | def test_lm_loss_approx(self, config_name): 39 | # Expected training loss curve at different global steps. 40 | self._setup(config_name) 41 | self._test_helper("lm loss", TypeOfTest.APPROX) 42 | 43 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 44 | def test_num_zeros_deterministic(self, config_name): 45 | # Expected validation loss curve at different global steps. 46 | self._setup(config_name) 47 | self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC) 48 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4681, 10.45734, 10.4491, 10.44102, 10.41779, 10.34626, 10.11378, 10.04382, 9.86692, 9.67893]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2373.0, 2593.0, 2187.0, 2403.0, 2412.0, 2617.0, 3083.0, 3341.0, 3558.0, 3213.0]}, "iteration_timing_avg": 0.8346488235294117} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.42107, 10.42897, 10.43577, 10.40787, 10.38455, 10.32433, 10.13158, 10.04316, 9.86274, 9.65777]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2229.0, 3600.0, 3300.0, 3311.0, 3522.0, 3498.0, 4076.0, 4135.0, 4709.0, 4350.0]}, "iteration_timing_avg": 1.8964105882352944} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50096, 10.48594, 10.4936, 10.48501, 10.50417, 10.4773, 10.42153, 10.29719, 10.15831, 9.9675]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [18201.0, 19789.0, 21743.0, 18735.0, 21941.0, 19700.0, 21781.0]}, "iteration_timing_avg": 0.4730702941176471} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.49275, 10.48836, 10.51349, 
10.49399, 10.47549, 10.41922, 10.28044, 10.14255, 9.94736]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26212.0, 19433.0, 24101.0, 23509.0, 21539.0, 17889.0, 19123.0]}, "iteration_timing_avg": 1.6886158823529411} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44389, 10.35605, 10.13777, 10.04004, 9.86833, 9.67303]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2398.0, 2539.0, 2945.0, 3162.0, 3457.0, 3125.0]}, "iteration_timing_avg": 0.8110379411764704} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.49566, 8 | 10.48166, 9 | 10.48045, 10 | 10.45348, 11 | 10.44393, 12 | 10.35605, 13 | 10.13787, 14 | 10.04034, 15 | 9.86836, 16 | 9.6732 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2183.0, 25 | 2469.0, 26 | 2115.0, 27 | 2126.0, 28 | 2322.0, 29 | 2411.0, 30 | 2892.0, 31 | 3234.0, 32 | 3637.0, 33 | 2992.0 34 | ] 35 | }, 36 | "mem-allocated-bytes": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 1718216192.0, 42 | 1718216192.0, 43 | 1718216192.0, 44 | 1718216192.0, 45 | 1718216192.0, 46 | 1718216192.0, 47 | 1718216192.0, 48 | 1718216192.0, 49 | 1718216192.0, 50 | 1718216192.0 51 | ] 52 | }, 53 | "iteration-time": { 54 | "start_step": 0, 55 | "end_step": 50, 56 | "step_interval": 5, 57 | "values": [ 58 | 13.22827, 59 | 0.88854, 60 | 0.92588, 61 | 0.89793, 62 | 0.95437, 63 | 0.88007, 64 | 0.88504, 65 | 0.88703, 66 | 0.89866, 67 | 0.88756 68 | ] 69 | } 70 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49533, 10.46662, 10.42395, 10.30693, 10.15979, 9.96957, 9.87618, 9.75265, 9.63628, 9.54659, 9.49973, 9.35968, 9.33181, 9.2626, 9.26439, 9.21492]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [18772.0, 19035.0, 22350.0, 18671.0, 20738.0, 23121.0, 22655.0, 27141.0, 24304.0, 25619.0, 17322.0, 32489.0, 28409.0, 21067.0, 37615.0, 30599.0, 26145.0]}, "iteration_timing_avg": 0.3927519402985073} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49569, 10.4596, 10.32846, 10.17265, 9.96951]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 
27627.0, 22759.0, 22567.0, 20671.0, 23229.0]}, "iteration_timing_avg": 0.7692817647058824} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.0958791176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312, 9.8347, 9.61264, 9.67965, 9.68133, 9.60021, 9.06887, 9.46573, 9.06116, 9.32103, 9.51104]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0, 2686.0, 2671.0, 3014.0, 3152.0, 2960.0, 3015.0, 3735.0, 2675.0, 2947.0, 3414.0]}, "iteration_timing_avg": 0.08244119402985074} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 
1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.1541691176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153, 9.83685, 9.60745, 9.68285, 9.6869, 9.60677, 9.07989, 9.47324, 9.07018, 9.33019, 9.51809]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0, 2540.0, 2588.0, 3110.0, 3059.0, 2924.0, 2894.0, 3694.0, 2720.0, 2635.0, 3456.0]}, "iteration_timing_avg": 0.150555671641791} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404, 9.85697, 9.65534, 9.71837, 9.74563, 9.63824, 9.13952, 9.51114, 9.10678, 9.3932, 9.56085]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0, 4218.0, 4359.0, 4468.0, 5080.0, 4575.0, 4964.0, 5755.0, 4852.0, 4092.0, 5592.0]}, "iteration_timing_avg": 0.33336671641791044} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81916, 10.86702, 10.85724, 10.80665, 10.71115, 10.63679, 10.16197, 10.277, 10.18384, 9.88281, 9.89125, 9.67734, 9.74917, 9.75758, 9.65591, 9.15592, 9.52069, 9.11526, 9.4051, 9.56814]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [7138.0, 8525.0, 8821.0, 8718.0, 7682.0, 8227.0, 7158.0, 8514.0, 9143.0, 9624.0, 9298.0, 10386.0, 10352.0, 12164.0, 10941.0, 12318.0, 13902.0, 11709.0, 10898.0, 12956.0]}, "iteration_timing_avg": 0.33394373134328353} -------------------------------------------------------------------------------- 
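Each golden-result file in this directory follows the same schema: for every tracked metric (`lm loss`, `num-zeros`, and in the newer files `mem-allocated-bytes` and `iteration-time`), the fields `start_step`, `end_step`, and `step_interval` describe which training steps were sampled, and `values` holds one reading per sampled step, i.e. one entry per element of `range(start_step, end_step, step_interval)`. The sketch below is a minimal, hypothetical consistency check over such a file; the actual comparison logic lives in the functional-test harness and is not shown here.

import json

def load_golden(path):
    # Load a golden-result file and sanity-check each metric's shape.
    with open(path) as f:
        golden = json.load(f)
    for metric, spec in golden.items():
        if not isinstance(spec, dict):
            # Scalars such as "iteration_timing_avg" carry no step schedule.
            continue
        steps = range(spec["start_step"], spec["end_step"], spec["step_interval"])
        assert len(spec["values"]) == len(steps), (
            f"{metric}: expected {len(steps)} values, got {len(spec['values'])}"
        )
    return golden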
/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0]}, "iteration_timing_avg": 0.33478764705882363} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81916, 10.86661, 10.85683, 10.80678, 10.7112, 10.63712, 10.16253, 10.27882, 10.18795, 9.88907]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [12923.0, 15794.0, 16416.0, 15771.0, 14114.0, 15096.0, 12918.0, 15842.0, 16657.0, 17467.0]}, "iteration_timing_avg": 0.340485} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0]}, "iteration_timing_avg": 0.27329441176470587} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.27828194029850745} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.2851294029850746} 
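A useful invariant visible in these goldens: each `resume_*` variant extends the corresponding 50-step run to 100 steps, and its first ten sampled `lm loss` values coincide with the shorter baseline (compare `...mcore_tp4_pp1.json` with `...mcore_tp4_pp1_resume_torch.json` above), which is exactly what a correct checkpoint resume should reproduce. A hypothetical spot check, reusing the `load_golden` sketch from earlier:

# Paths are illustrative; the files live under tests/functional_tests/test_results/jet/.
baseline = load_golden("gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json")
resumed = load_golden("gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json")
n = len(baseline["lm loss"]["values"])
# The resumed run must retrace the baseline trajectory over the first 50 steps.
assert resumed["lm loss"]["values"][:n] == baseline["lm loss"]["values"][:n]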
-------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 
10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 
1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} -------------------------------------------------------------------------------- 
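Exact float equality against these goldens is brittle across GPU architectures, driver stacks, and library versions, so comparisons of the sampled metric values are normally done with a tolerance. The tolerances below are assumptions for illustration only; the thresholds actually used by the harness are not recorded in these files.

import math

def compare_metric(expected, actual, rel_tol=1e-4, abs_tol=0.0):
    # Compare two equally long lists of sampled metric values.
    assert len(expected) == len(actual), "step sampling mismatch"
    for i, (e, a) in enumerate(zip(expected, actual)):
        assert math.isclose(e, a, rel_tol=rel_tol, abs_tol=abs_tol), (
            f"sample {i}: expected {e}, got {a}"
        )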
/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86506, 10.87184, 10.80703, 10.71158, 10.63915, 10.1929, 10.30937, 10.21969, 9.91592]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37021.0, 37806.0, 36157.0, 33974.0, 34873.0, 30957.0, 35062.0, 36419.0, 37713.0]}, "iteration_timing_avg": 0.35529294117647064} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86462, 10.87239, 10.80678, 10.7118, 10.63911, 10.19319, 10.30944, 10.21988, 9.91603]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37033.0, 37783.0, 36040.0, 33452.0, 34761.0, 30933.0, 35487.0, 36392.0, 37655.0]}, "iteration_timing_avg": 0.3566726470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86367, 10.80237, 10.71665, 10.6452, 10.21186, 10.32279, 10.22474, 9.93034]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38080.0, 36072.0, 33389.0, 34302.0, 30262.0, 35071.0, 36081.0, 36818.0]}, "iteration_timing_avg": 0.2153429411764706} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86435, 10.80239, 10.7159, 10.6454, 10.21181, 10.32236, 10.22471, 9.92956]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38276.0, 36278.0, 32946.0, 34291.0, 30145.0, 35217.0, 36060.0, 37032.0]}, "iteration_timing_avg": 0.21900323529411767} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86861, 10.87486, 10.7986, 10.66452, 10.58021, 10.05487, 10.18533, 10.097, 9.75749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [26144.0, 31960.0, 32510.0, 31451.0, 28954.0, 30872.0, 29506.0, 33312.0, 34558.0, 36855.0]}, "iteration_timing_avg": 0.28211852941176474} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 
10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} 
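Several of the tp2_pp2 variants above (the baseline, `ddp_average_in_collective`, and `no_create_attention_mask_in_dataloader`) carry identical golden values: those switches change implementation details, not the numerics, so their loss trajectories are expected to match exactly. A hypothetical check of that property, again reusing `load_golden`:

a = load_golden("gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json")
b = load_golden("gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json")
# Numerics-neutral options must not perturb the trajectory at all.
assert a["lm loss"] == b["lm loss"] and a["num-zeros"] == b["num-zeros"]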
-------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62396, 10.53554, 10.25187, 10.20873, 9.96714, 9.96605, 9.92367, 9.79179, 9.26742, 9.61926, 9.18974, 9.46019, 9.62277]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2078.0, 2328.0, 2420.0, 2256.0, 2180.0, 2078.0, 2313.0, 2933.0, 2712.0, 2270.0, 2872.0, 3003.0, 3555.0, 3066.0, 3103.0, 3098.0, 3762.0]}, "iteration_timing_avg": 0.13093716417910448} -------------------------------------------------------------------------------- 
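`iteration_timing_avg`, by contrast, is a single scalar per file: the average wall-clock time per training iteration over the run (plausibly in seconds, given the magnitudes). Timing is far noisier than loss across machines, so any regression check against it should be much looser than the metric comparison; the slack factor below is purely an assumed example.

def check_timing(expected_avg, actual_avg, slack=1.5):
    # Flag only large regressions; small timing jitter is expected run to run.
    assert actual_avg <= expected_avg * slack, (
        f"iteration time regressed: {actual_avg:.3f}s vs golden {expected_avg:.3f}s"
    )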
/tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13995, 9.14036, 9.13054, 9.12408, 9.0791, 9.06608, 9.01164, 8.97073, 8.93805, 8.85873]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2852600.0, 2939939.0, 2850191.0, 2774638.0, 3035015.0, 2853397.0, 2787109.0, 2832834.0, 2809354.0, 2940633.0]}, "iteration_timing_avg": 0.2253964705882353} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp2_pp4_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13682, 9.13803, 9.13233, 9.12379, 9.09228, 9.07609, 9.02997, 8.99391, 8.96074, 8.89575]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2918419.0, 3005942.0, 2916151.0, 2840544.0, 3100625.0, 2919164.0, 2852935.0, 2898444.0, 2875057.0, 3006499.0]}, "iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/t5_220m_mr_mcore_tp2_pp4_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.39452, 9.22332, 8.69422, 8.39796, 8.11874, 8.01176, 7.72419, 7.44126, 7.3078, 7.2363]}, "num-zeros": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [115739.0, 111092.0, 117169.0, 112383.0, 118597.0, 117024.0, 111417.0, 114098.0, 118529.0, 117033.0]}, "iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import torch._dynamo 2 | torch._dynamo.config.suppress_errors = True -------------------------------------------------------------------------------- /tests/unit_tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/data/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/data/test_multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | ## 4 | # Compile megatron.core.datasets.helpers dependencies before BlendedDataset import 5 | ## 6 | 7 | import torch 8 | 9 | from megatron.core.datasets.utils import compile_helpers 10 | from tests.unit_tests.test_utilities import Utils 11 | 12 | if torch.distributed.is_available(): 13 | Utils.initialize_distributed() 14 | if torch.distributed.get_rank() == 0: 15 | compile_helpers() 16 | torch.distributed.barrier() 17 | else: 18 | compile_helpers() 19 | 20 | ## 21 | # Done 22 | ## 23 | 24 | from types import SimpleNamespace 25 | 26 | from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder 27 | from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig 28 | from megatron.training.tokenizer.tokenizer import _NullTokenizer 29 | 30 | _MOCK_VOCAB_SIZE = 8192 31 | 32 | 33 | def test_mock_multimodal_dataset(): 34 | config = MultimodalDatasetConfig( 35 | random_seed=1234, 36 | sequence_length=1024, 37 | reset_position_ids=False, 38 | reset_attention_mask=False, 39 | eod_mask_loss=True, 40 | image_h=336, 41 | image_w=336, 42 | split="990,9,1", 43 | tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE), 44 | ) 45 | 46 | datasets = BlendedMegatronDatasetBuilder( 47 | MockMultimodalDataset, [100, 100, 100], lambda: True, config 48 | ).build() 49 | 50 | for ds in datasets: 51 | sample = ds[0] 52 | assert "image" in sample 53 | assert sample["image"].shape == torch.Size([3, 336, 336]) 54 | assert "tokens" in sample 55 | 56 | 57 | if __name__ == "__main__": 58 | test_mock_multimodal_dataset() 59 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import weakref 3 | from pathlib import Path 4 | from shutil import rmtree 5 | from tempfile import TemporaryDirectory 6 | from typing import Union, Optional 7 | 8 | from tests.unit_tests.test_utilities import Utils 9 | 10 | 11 | def empty_dir(path: Path): 12 | if Utils.rank > 0: 13 | return 14 | for p in path.iterdir(): 15 | if p.is_dir(): 16 | rmtree(p) 17 | else: 18 | p.unlink() 19 | 20 | 21 | 22 | class TempNamedDir(TemporaryDirectory): 23 | """ TemporaryDirectory with a fully named directory. Empties the dir if not empty. 
""" 24 | def __init__(self, name: Union[str, Path], sync=True, 25 | ignore_cleanup_errors=False) -> None: 26 | self.name = str(name) 27 | if Utils.rank == 0: 28 | os.makedirs(name, exist_ok=True) 29 | empty_dir(Path(name)) 30 | 31 | self._ignore_cleanup_errors = ignore_cleanup_errors 32 | self._finalizer = weakref.finalize( 33 | self, self._cleanup, self.name, 34 | warn_message="Implicitly cleaning up {!r}".format(self)) 35 | self.sync = sync 36 | 37 | def cleanup(self, override_sync: Optional[bool] = None) -> None: 38 | sync = self.sync if override_sync is None else override_sync 39 | if sync : 40 | import torch 41 | torch.distributed.barrier() 42 | 43 | if Utils.rank == 0: 44 | super().cleanup() 45 | 46 | def __enter__(self): 47 | path = Path(super().__enter__()) 48 | if self.sync: 49 | import torch 50 | torch.distributed.barrier() 51 | return path 52 | 53 | def __exit__(self, exc_type, exc_val, exc_tb): 54 | raised = exc_type is not None 55 | if not raised: 56 | self.cleanup() 57 | 58 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from unittest import mock 3 | 4 | import pytest 5 | 6 | from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy 7 | from tests.unit_tests.dist_checkpointing import TempNamedDir 8 | from tests.unit_tests.test_utilities import Utils 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def tmp_path_dist_ckpt(tmp_path_factory) -> Path: 13 | """ Common directory for saving the checkpoint. 14 | 15 | Can't use pytest `tmp_path_factory` directly because directory must be shared between processes. 
""" 16 | 17 | tmp_dir = tmp_path_factory.mktemp('ignored', numbered=False) 18 | tmp_dir = tmp_dir.parent.parent / 'tmp_dist_ckpt' 19 | 20 | if Utils.rank == 0: 21 | with TempNamedDir(tmp_dir, sync=False): 22 | yield tmp_dir 23 | 24 | else: 25 | yield tmp_dir 26 | 27 | 28 | @pytest.fixture(scope='session', autouse=True) 29 | def set_default_dist_ckpt_strategy(): 30 | def get_pyt_dist_save_sharded_strategy(): 31 | return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1) 32 | 33 | with mock.patch( 34 | 'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy', 35 | new=get_pyt_dist_save_sharded_strategy, 36 | ) as _fixture: 37 | yield _fixture 38 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/dist_checkpointing/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/fusions/test_torch_softmax.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax 5 | from megatron.core.transformer.enums import AttnMaskType 6 | from megatron.core.transformer.utils import attention_mask_func 7 | 8 | 9 | class TestTorchSoftmax: 10 | def setup_method(self, method): 11 | # The important settings tested are forward_torch_softmax path 12 | # with locally generated casual mask for attention_mask_func: 13 | self.softmax = FusedScaleMaskSoftmax( 14 | input_in_fp16=False, 15 | input_in_bf16=False, 16 | attn_mask_type=AttnMaskType.causal, 17 | scaled_masked_softmax_fusion=False, 18 | mask_func=attention_mask_func, 19 | softmax_in_fp32=True, 20 | scale=None, 21 | ) 22 | 23 | def test_output_shape(self): 24 | x = torch.randn(8, 2, 4, 4, device="cuda") 25 | y = self.softmax(x, None) 26 | assert x.shape == y.shape 27 | 28 | def test_causal_mask_input_shape_assert(self): 29 | x = torch.randn(1, 1, 4, 16, device="cuda") 30 | with pytest.raises(AssertionError): 31 | self.softmax(x, None) 32 | 33 | def test_causal_mask_equal_scores(self): 34 | # For equal input values (e.g. zero) correctly masked softmax should 35 | # produce equal scores among non-masked elements. 
For example, in case 36 | # sq == sk == 2 the expected output is (ignoring b and np dimensions): 37 | # [[1.0, 0.0], 38 | # [0.5, 0.5]] 39 | b, np, sq, sk = 8, 2, 32, 32 40 | x = torch.zeros([b, np, sq, sk]).cuda() 41 | y = self.softmax(x, None) 42 | y_expected = torch.tril(torch.ones(b, np, sq, sk, device="cuda")) 43 | y_expected /= torch.arange(1, sq + 1, device="cuda").reshape((-1, 1)) 44 | assert torch.allclose(y, y_expected, rtol=1e-08, atol=1e-08) 45 | -------------------------------------------------------------------------------- /tests/unit_tests/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/engines/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/model_inference_wrappers/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig 3 | 4 | class TestModelInferenceWrapperConfig: 5 | 6 | def test_inference_params(self): 7 | inference_parameters = InferenceWrapperConfig( 8 | hidden_size=10, 9 | inference_batch_times_seqlen_threshold=10, 10 | padded_vocab_size=10, 11 | params_dtype=torch.float, 12 | fp32_residual_connection=False 13 | ) 14 | inference_parameters.add_attributes({"abc": 45}) 15 | assert inference_parameters.abc == 45, f"abc attribute not set correctly. it is {inference_parameters.abc}" -------------------------------------------------------------------------------- /tests/unit_tests/inference/test_common_inference_params.py: -------------------------------------------------------------------------------- 1 | from megatron.core.inference.common_inference_params import CommonInferenceParams 2 | 3 | class TestCommonInferenceParams: 4 | 5 | def test_inference_params(self): 6 | inference_parameters = CommonInferenceParams() 7 | inference_parameters.add_attributes({"min_tokens": 45}) 8 | assert inference_parameters.min_tokens == 45, f"min tokens not set correctly.
it is {inference_parameters.min_tokens}" -------------------------------------------------------------------------------- /tests/unit_tests/inference/test_inference_utils.py: -------------------------------------------------------------------------------- 1 | from megatron.core.inference.utils import Counter 2 | 3 | class TestInferenceUtils: 4 | 5 | def test_counter(self): 6 | counter = Counter() 7 | r = next(counter) 8 | assert r == 0, f'Counter return value should be 0 but it is {r}' 9 | assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}' 10 | counter.reset() 11 | assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}' 12 | -------------------------------------------------------------------------------- /tests/unit_tests/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/text_generation_controllers/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/tensor_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | 
input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed,get_cuda_rng_tracker 3 | from megatron.core.tensor_parallel.random import checkpoint 4 | from tests.unit_tests.test_utilities import Utils 5 | import pytest 6 | import torch 7 | 8 | def test_cuda_rng_states_tracker(): 9 | rng_tracker = CudaRNGStatesTracker() 10 | rng_tracker.set_states({"state1":1234}) 11 | assert(rng_tracker.get_states()["state1"] == 1234) 12 | rng_tracker.reset() 13 | assert(rng_tracker.get_states() == {}) 14 | seed = 1111 15 | rng_tracker.add("state2",seed) 16 | with pytest.raises(Exception): 17 | assert(rng_tracker.add("state3",seed)) 18 | with pytest.raises(Exception): 19 | assert(rng_tracker.add("state2",111)) 20 | assert(rng_tracker.get_states()['state2'] is not None) 21 | with pytest.raises(Exception): 22 | assert() 23 | 24 | rng_tracker.fork("state2") 25 | torch.cuda.manual_seed(seed) 26 | rng_state = torch.cuda.get_rng_state() 27 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 28 | 29 | def test_model_parallel_cuda_manual_seed(): 30 | Utils.initialize_model_parallel(4,2) 31 | model_parallel_cuda_manual_seed(0) 32 | rng_tracker = get_cuda_rng_tracker() 33 | assert(rng_tracker.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() 45 | -------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_local_multi_tensor_fns.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from megatron.core.utils import ( 3 | local_multi_tensor_applier, 4 | local_multi_tensor_l2_norm, 5 | local_multi_tensor_scale 6 | ) 7 | import pytest 8 | import torch 9 | 10 | def test_local_multi_tensor_l2_norm_and_scale(): 11 | amp_C = pytest.importorskip("amp_C") 12 | multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") 13 | 14 | torch.manual_seed(42) 15 | 16 | tensor_list = [torch.rand(5,5).cuda() for _ in range(10)] 17 | 
--------------------------------------------------------------------------------
/tests/unit_tests/test_basic.py:
--------------------------------------------------------------------------------
def test_import():
    import megatron
--------------------------------------------------------------------------------
/tests/unit_tests/test_local_multi_tensor_fns.py:
--------------------------------------------------------------------------------
import copy
from megatron.core.utils import (
    local_multi_tensor_applier,
    local_multi_tensor_l2_norm,
    local_multi_tensor_scale,
)
import pytest
import torch

def test_local_multi_tensor_l2_norm_and_scale():
    amp_C = pytest.importorskip("amp_C")
    multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply")

    torch.manual_seed(42)

    tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)]
    # Deep-copy so the in-place scale below can be compared across implementations.
    tensor_list_copy = copy.deepcopy(tensor_list)

    norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    norm_local, _ = multi_tensor_apply.multi_tensor_applier(local_multi_tensor_l2_norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy], False)
    torch.testing.assert_close(norm_apex, norm_local)

    clip_coeff = 0.05
    multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list, tensor_list], clip_coeff)
    multi_tensor_apply.multi_tensor_applier(local_multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy, tensor_list_copy], clip_coeff)
    torch.testing.assert_close(tensor_list, tensor_list_copy)

def test_local_multi_tensor_apply():
    amp_C = pytest.importorskip("amp_C")
    multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply")

    tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)]

    norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    norm_local, _ = local_multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    torch.testing.assert_close(norm_apex, norm_local)
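The tests above only cross-check each local helper against its apex counterpart. A sketch of the intended all-local composition when apex/amp_C are not installed (an assumption; the calling convention mirrors apex's multi_tensor_applier):

    import torch
    from megatron.core.utils import local_multi_tensor_applier, local_multi_tensor_l2_norm

    tensors = [torch.rand(5, 5, device='cuda') for _ in range(10)]
    noop_flag = torch.tensor([0], dtype=torch.int, device='cuda')
    # Pure-PyTorch fallback grad-norm, no fused kernels required.
    norm, _ = local_multi_tensor_applier(local_multi_tensor_l2_norm, noop_flag, [tensors], False)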
--------------------------------------------------------------------------------
/tests/unit_tests/test_training.py:
--------------------------------------------------------------------------------
from types import SimpleNamespace

from megatron.training.global_vars import set_args
from megatron.training.training import build_train_valid_test_data_iterators
from tests.unit_tests.test_utilities import Utils


def mock_train_valid_test_datasets_provider(train_val_test_num_samples):
    return 1, 2, 3


def create_test_args():
    # Set dummy values for the args.
    args = SimpleNamespace()
    args.iteration = 0
    args.train_samples = 1
    args.train_iters = 1
    args.eval_interval = 1
    args.eval_iters = 1
    args.global_batch_size = 1
    args.consumed_train_samples = 1
    args.consumed_valid_samples = 1
    args.dataloader_type = "external"
    args.skip_train = False

    return args


class TestTraining:
    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        args = create_test_args()
        set_args(args)

    def test_build_train_valid_test_data_iterators(self):
        train_iter, valid_iter, test_iter = build_train_valid_test_data_iterators(
            mock_train_valid_test_datasets_provider
        )

        assert (train_iter, valid_iter, test_iter) == (1, 2, 3)

    def teardown_method(self, method):
        Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/transformer/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/transformer/moe/__init__.py
--------------------------------------------------------------------------------
/tools/autoformat.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -euox pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
CHECK_ONLY=${CHECK_ONLY:-false}
CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/main megatron/core | grep '\.py$' || true)
ADDITIONAL_ARGS=""

if [[ $CHECK_ONLY == true ]]; then
  ADDITIONAL_ARGS="--check"
fi

# For now we just format megatron/core.
if [[ -n "$CHANGED_FILES" ]]; then
  black $ADDITIONAL_ARGS --verbose --diff $CHANGED_FILES
  isort $ADDITIONAL_ARGS $CHANGED_FILES
else
  echo "Changeset is empty, all good."
fi
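A typical local invocation is `CHECK_ONLY=true bash tools/autoformat.sh` to report violations without rewriting files, or plain `bash tools/autoformat.sh` to apply black and isort in place. Both assume `origin/main` has been fetched, since the changeset is computed against its merge base.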
--------------------------------------------------------------------------------
/tools/bert_embedding/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .embed import BertEmbedder, DiskDataParallelBertEmbedder
--------------------------------------------------------------------------------
/tools/bert_embedding/dataset.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import numpy as np
import torch

from megatron.training import get_args, get_tokenizer


class BertEmbeddingDataset(torch.utils.data.Dataset):
    '''Dataset to convert a text dataset to Bert tokens.'''

    def __init__(self, text_dataset, max_seq_length):

        super().__init__()

        args = get_args()

        # Dataset, tokenizer.
        self.text_dataset = text_dataset
        self.max_seq_length = max_seq_length
        self.bert_tokenizer = get_tokenizer()

    def __len__(self):
        return len(self.text_dataset)

    @classmethod
    def build_sample(cls, tokenizer, token_ids):
        get_constant_array = lambda c: np.full((len(token_ids) + 2,), c, "int64")
        return {
            "text": np.array([tokenizer.cls, *token_ids, tokenizer.sep], dtype="int64"),
            "types": get_constant_array(0),
            "labels": get_constant_array(-1),
            "is_random": 0,
            "loss_mask": get_constant_array(0),
            "padding_mask": get_constant_array(1),
            "truncated": 0,
        }

    def __getitem__(self, idx):

        # Text.
        text_sample = self.text_dataset[idx]
        text = text_sample["text"]
        text = text.replace("<|endoftext|>", "")

        # Bert/Wordpiece tokens (+truncate).
        bert_token_ids = self.bert_tokenizer.tokenize(text)
        bert_token_ids = bert_token_ids[:self.max_seq_length - 2]  # Leave room for cls + sep.
        if not bert_token_ids:
            bert_token_ids = [self.bert_tokenizer.pad_id]  # Fallback for an empty sequence.

        # Bert sample.
        sample = self.build_sample(self.bert_tokenizer, bert_token_ids)

        return sample
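As a concrete illustration of `build_sample`: for `token_ids` `[5, 6]` and hypothetical special-token ids `cls=101`, `sep=102`, the sample's "text" field is `[101, 5, 6, 102]`, and every constant array ("types", "labels", "loss_mask", "padding_mask") has the matching length of 4.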
--------------------------------------------------------------------------------
/tools/bert_embedding/external_libs.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import importlib

required_libs = [
    "h5py",
    "transformers",  # for huggingface bert
]

for lib in required_libs:
    try:
        globals()[lib] = importlib.import_module(lib)
    except ImportError as e:
        raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") from e
--------------------------------------------------------------------------------
/tools/checkpoint/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import psutil


def print_memory_usage(key, rank, num_ranks):
    '''Print memory usage.'''
    process = psutil.Process()
    mem_info = process.memory_info()
    print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb."
          % (
              key,
              rank,
              num_ranks,
              mem_info.rss / 1024**3,
              # memory_percent() == 100 * rss / total, so this term recovers
              # total system memory in gb.
              100 * mem_info.rss / process.memory_percent() / 1024**3,
          ))


def get_mcore_transformer_block_key(model_key):
    return {
        "GPT": "decoder",
        "BERT": "encoder",
    }[model_key]
--------------------------------------------------------------------------------
/tools/linter.py:
--------------------------------------------------------------------------------
import os
import os.path as osp
import pathlib
import subprocess


def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm"""

    try:
        import autopep8
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # Get all python file paths from the top-level directory.
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    all_py_paths = set(os.path.join(working_dir, fname)
                       for fname in os.listdir(working_dir) if fname.endswith(".py"))

    # Get all python file paths from the chosen subdirectories.
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith(".py")))

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass the command as an argument list: a single string would require
        # shell=True and would break on paths containing spaces.
        command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
--------------------------------------------------------------------------------
/tools/openwebtext/add_id.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

import argparse
import json
import time

"""
This code adds an id to each json object in a json file. The user can add a
prefix to the ids.
"""

if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', type=str, default=None,
                        help='Input json file where an id needs to be added')
    parser.add_argument('--output-file', type=str, default=None,
                        help='Output file name with id')
    parser.add_argument('--id-prefix', type=str, default=None,
                        help='Id prefix')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')
    args = parser.parse_args()

    print('Adding ids to dataset ...')

    f_input = open(args.input_file, 'r', encoding='utf-8')
    f_output = open(args.output_file, 'wb')

    unique_ids = 1
    start_time = time.time()
    for row in f_input:
        each_row = json.loads(row)
        adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids))
        each_row['adlr_id'] = adlr_id_string
        myjson = json.dumps(each_row, ensure_ascii=False)

        f_output.write(myjson.encode('utf-8'))
        f_output.write('\n'.encode('utf-8'))

        if unique_ids % args.log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                unique_ids, time.time() - start_time), flush=True)

        unique_ids += 1

    # Close the files.
    f_input.close()
    f_output.close()

    print('done :-)', flush=True)
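A typical invocation (file names and prefix illustrative): `python tools/openwebtext/add_id.py --input-file docs.json --output-file docs_with_id.json --id-prefix owt`, which stamps each JSON row with an `adlr_id` such as `owt-0000000001`.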
--------------------------------------------------------------------------------
/tools/openwebtext/merge_jsons.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.


import argparse
import glob
import json

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--json_path", type=str, default=".",
                        help="path where all the json files are located")

    parser.add_argument("--output_file", type=str, default="merged_output.json",
                        help="filename where the merged json should go")

    args = parser.parse_args()

    json_path = args.json_path
    out_file = args.output_file

    json_files = glob.glob(json_path + '/*.json')

    counter = 0

    with open(out_file, 'w') as outfile:
        for fname in json_files:
            counter += 1

            if counter % 1024 == 0:
                print("Merging at ", counter, flush=True)

            with open(fname, 'r') as infile:
                for row in infile:
                    # Parse each row first so malformed json fails loudly
                    # instead of being silently copied through.
                    json.loads(row)
                    outfile.write(row)

    print("Merged file", out_file, flush=True)
--------------------------------------------------------------------------------
/tools/report_theoretical_memory.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

"""Computes theoretical memory footprint for model training without instantiating
a model and running training iterations on GPU(s)."""

from megatron.training import get_args
from megatron.training.initialize import initialize_megatron
from megatron.training.theoretical_memory_usage import report_theoretical_memory

if __name__ == "__main__":
    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    args = get_args()

    report_theoretical_memory(args, verbose=True)
--------------------------------------------------------------------------------
/tools/retro/cli/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

from .cli import retro
--------------------------------------------------------------------------------
/tools/retro/cli/__main__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import os

from . import retro


if __name__ == "__main__":
    retro.init(os.environ["RETRO_PROJECT_DIR"])
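The module is run with the project directory exported, e.g. `RETRO_PROJECT_DIR=/path/to/retro/project python -m tools.retro.cli` (path illustrative), which `retro.init` reads from the environment.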
--------------------------------------------------------------------------------
/tools/retro/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM nvcr.io/nvidia/pytorch:23.09-py3

# Group the installs to keep the image layer count down.
RUN apt update && apt install -qy htop

RUN pip install -U faiss-gpu transformers google-api-python-client && \
    pip install sentencepiece h5py nltk einops
--------------------------------------------------------------------------------
/tools/retro/sft/README.md:
--------------------------------------------------------------------------------
## Note

The content within this `sft` directory is still under active development and will be updated soon.
--------------------------------------------------------------------------------
/tools/retro/sft/open_inst.sh:
--------------------------------------------------------------------------------
DATA_BLEND="1.0 open_inst"
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import sys
import json
import requests


if __name__ == "__main__":
    url = sys.argv[1]
    url = 'http://' + url + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        # int() already parses the numeric reply; eval() on raw user input
        # was both unnecessary and unsafe.
        tokens_to_generate = int(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
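For a non-interactive check of the same REST protocol, the sketch below issues a single PUT programmatically. It assumes a text generation server is already listening at localhost:5000; the host, port, and prompt are illustrative, while the endpoint path and payload keys mirror the CLI above.

    import json
    import requests

    URL = "http://localhost:5000/api"  # assumed server address
    payload = {"prompts": ["The quick brown fox"], "tokens_to_generate": 16}
    response = requests.put(URL, data=json.dumps(payload),
                            headers={"Content-Type": "application/json"})

    if response.status_code == 200:
        print(response.json()["text"][0])  # generated continuation of the prompt
    else:
        print(f"Error {response.status_code}: {response.json()['message']}")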