├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── enhancement.md │ ├── question.md │ └── regression.md └── workflows │ └── stale.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── CONTRIBUTING.md ├── Dockerfile.ci ├── Dockerfile.linting ├── LICENSE ├── MANIFEST.in ├── README.md ├── artifact ├── README.md ├── example-results │ ├── full-exp.txt │ └── quick-exp.txt ├── exp_one_host.csv ├── full_exp.sh ├── quick_exp.sh └── show_result_full_exp.py ├── docs ├── llama_mistral.md └── source │ ├── api-guide │ ├── context_parallel.rst │ ├── datasets.rst │ ├── dist_checkpointing.rst │ ├── dist_checkpointing.strategies.rst │ ├── distributed.rst │ ├── fusions.rst │ ├── index.rst │ ├── models.bert.rst │ ├── models.gpt.rst │ ├── models.rst │ ├── models.t5.rst │ ├── moe.rst │ ├── num_microbatches_calculator.rst │ ├── pipeline_parallel.rst │ ├── tensor_parallel.rst │ └── transformer.rst │ ├── distrib_optimizer.md │ ├── images │ ├── context_parallel │ │ ├── CP_overview.png │ │ └── CP_results.png │ └── distrib_optimizer │ │ ├── data_flow.png │ │ └── sharding_scheme.png │ ├── index.rst │ └── user-guide │ └── index.rst ├── examples ├── academic_paper_scripts │ ├── detxoify_lm │ │ ├── README.md │ │ ├── annotations │ │ │ ├── filter-selfgeneration.py │ │ │ ├── perspective_api_annotate.py │ │ │ └── preprocess.sh │ │ ├── finetune_gpt.py │ │ ├── finetune_gpt_distributed-1.3b.sh │ │ ├── generate-1.3b.sh │ │ ├── generate_samples_gpt.py │ │ ├── perspective_api.py │ │ └── self_generation │ │ │ └── selfgenerate-1.3b-unconditional.sh │ ├── msdp │ │ ├── README.md │ │ ├── data_processing.sh │ │ ├── eval_knwl_generation.sh │ │ ├── eval_resp_generation.sh │ │ ├── prep_resp_gen.sh │ │ ├── prompt_knwl_gen.sh │ │ └── prompt_resp_gen.sh │ └── sc21 │ │ ├── CONFIG.sh │ │ ├── README.md │ │ ├── SBATCH.sh │ │ ├── SRUN.sh │ │ ├── run_figure_11.sh │ │ ├── run_figure_12.sh │ │ ├── run_figure_13.sh │ │ ├── run_figure_14.sh │ │ ├── run_figure_15.sh │ │ ├── run_figure_16.sh │ │ ├── run_figure_17.sh │ │ ├── run_figure_18.sh │ │ └── run_table_1.sh ├── bert │ ├── README.md │ └── train_bert_340m_distributed.sh ├── gpt3 │ ├── README.md │ ├── gpt_config.yaml │ └── train_gpt3_175b_distributed.sh ├── inference │ ├── README.md │ ├── gpt │ │ └── simple_gpt_batch_inference.py │ ├── quantization │ │ ├── README.md │ │ ├── ptq_trtllm_llama_7b.sh │ │ ├── ptq_trtllm_nemotron3_8b.sh │ │ ├── text_generation_ptq.py │ │ └── trtllm_text_generation.py │ ├── run_text_generation_server_345M.sh │ └── run_text_generation_server_345M_8_tensor_parallel.sh ├── mamba │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── run_text_gen_server_8b.sh │ ├── run_text_gen_server_8b_gpt3.sh │ └── train.sh ├── mixtral │ ├── README.md │ └── train_mixtral_8x7b_distributed.sh ├── multimodal │ ├── Dockerfile │ ├── README.md │ ├── assets │ │ └── pretrain_curves.png │ ├── clip_converter.py │ ├── combine_mistral_clip.sh │ ├── combine_state_dicts.py │ ├── config.py │ ├── convert_llava_pretrain_to_wds.py │ ├── dataloader_provider.py │ ├── dataset_helpers.py │ ├── evaluate_coco.py │ ├── evaluate_mmmu.py │ ├── evaluate_textvqa.py │ ├── evaluate_vqav2.py │ ├── layer_specs.py │ ├── manual_prompts.json │ ├── pretrain_dataset.yaml │ ├── pretrain_mistral_clip.sh │ ├── run_text_generation.py │ ├── sft_dataset.yaml │ ├── sft_mistral_clip.sh │ ├── text_generation_mistral_clip.sh │ └── train.py ├── retro │ ├── README.md │ ├── preprocess_data.sh │ └── train_retro_2b_distributed.sh ├── run_simple_mcore_train_loop.py └── t5 │ ├── README.md │ ├── t5_mcore_train_curve.png │ └── 
train_t5_220m_distributed.sh ├── images ├── expt-pp32-flops.png ├── expt-pp32-mem.png ├── model_table.png ├── schedule-interlaced.png ├── schedule-vocab-1.png ├── schedule-vocab-2.png ├── st-passes-1.png ├── st-passes-2.png ├── strong_scaling.png └── weak_scaling.png ├── input_store.py ├── jet-tests.yml ├── megatron ├── core │ ├── QuickStart.md │ ├── README.md │ ├── README_STRAGGLER.md │ ├── __init__.py │ ├── datasets │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── blended_dataset.py │ │ ├── blended_megatron_dataset_builder.py │ │ ├── blended_megatron_dataset_config.py │ │ ├── gpt_dataset.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ ├── masked_dataset.py │ │ ├── megatron_dataset.py │ │ ├── megatron_tokenizer.py │ │ ├── multimodal_dataset.py │ │ ├── readme.md │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_embedders.py │ │ │ │ ├── config.py │ │ │ │ ├── gpt_chunk_datasets.py │ │ │ │ └── tokenizers.py │ │ │ ├── db │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── dataset.py │ │ │ │ └── utils.py │ │ │ ├── external_libs.py │ │ │ ├── index │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── factory.py │ │ │ │ ├── index.py │ │ │ │ ├── indexes │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── faiss_base.py │ │ │ │ │ └── faiss_par_add.py │ │ │ │ ├── utils.py │ │ │ │ └── validate.py │ │ │ ├── query │ │ │ │ ├── __init__.py │ │ │ │ ├── gpt_chunk_dataset.py │ │ │ │ ├── multi_split_gpt_dataset.py │ │ │ │ ├── query.py │ │ │ │ ├── retro_dataset.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── t5_dataset.py │ │ ├── utils.py │ │ └── utils_s3.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── async_utils.py │ │ │ ├── base.py │ │ │ ├── common.py │ │ │ ├── filesystem_async.py │ │ │ ├── fully_parallel.py │ │ │ ├── resharding.py │ │ │ ├── state_dict_saver.py │ │ │ ├── tensorstore.py │ │ │ ├── torch.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ ├── utils.py │ │ └── validation.py │ ├── distributed │ │ ├── __init__.py │ │ ├── distributed_data_parallel.py │ │ ├── distributed_data_parallel_config.py │ │ ├── finalize_model_grads.py │ │ └── param_and_grad_buffer.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_geglu.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_bias_swiglu.py │ │ ├── fused_cross_entropy.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference │ │ ├── __init__.py │ │ ├── ammo_support │ │ │ ├── __init__.py │ │ │ └── gpt │ │ │ │ ├── __init__.py │ │ │ │ ├── model_specs.py │ │ │ │ └── state_dict_hooks.py │ │ ├── common_inference_params.py │ │ ├── communication_utils.py │ │ ├── engines │ │ │ ├── __init__.py │ │ │ ├── abstract_engine.py │ │ │ └── mcore_engine.py │ │ ├── inference_request.py │ │ ├── model_inference_wrappers │ │ │ ├── __init__.py │ │ │ ├── abstract_model_inference_wrapper.py │ │ │ ├── gpt │ │ │ │ ├── __init__.py │ │ │ │ └── gpt_inference_wrapper.py │ │ │ └── inference_wrapper_config.py │ │ ├── scheduler.py │ │ ├── text_generation_controllers │ │ │ ├── __init__.py │ │ │ └── simple_text_generation_controller.py │ │ └── utils.py │ ├── inference_params.py │ ├── jit.py │ ├── model_parallel_config.py │ ├── models │ │ ├── T5 │ │ │ ├── __init__.py │ │ │ ├── t5_model.py │ │ │ └── t5_spec.py │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── bert_layer_specs.py │ │ │ ├── bert_lm_head.py │ │ │ ├── 
bert_model.py │ │ │ └── pooler.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── embeddings │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model_embedding.py │ │ │ │ └── rotary_pos_embedding.py │ │ │ ├── language_module │ │ │ │ ├── __init__.py │ │ │ │ └── language_module.py │ │ │ └── vision_module │ │ │ │ ├── __init__.py │ │ │ │ └── vision_module.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_layer_specs.py │ │ │ └── gpt_model.py │ │ ├── mamba │ │ │ ├── __init__.py │ │ │ ├── mamba_layer_specs.py │ │ │ └── mamba_model.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── llava_model.py │ │ │ └── llava_spec.py │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── base_attention.py │ │ │ ├── config.py │ │ │ ├── decoder_attention.py │ │ │ ├── decoder_spec.py │ │ │ ├── encoder_attention.py │ │ │ ├── encoder_spec.py │ │ │ ├── model.py │ │ │ └── utils.py │ │ └── vision │ │ │ ├── __init__.py │ │ │ ├── clip_vit_model.py │ │ │ ├── multimodal_projector.py │ │ │ └── vit_layer_specs.py │ ├── num_microbatches_calculator.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── distrib_optimizer.py │ │ ├── grad_scaler.py │ │ ├── optimizer.py │ │ └── optimizer_config.py │ ├── package_info.py │ ├── packed_seq_params.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── interlaced_schedule.py │ │ ├── p2p_communication.py │ │ ├── schedule_timers.py │ │ ├── schedules.py │ │ └── vocab_parallel_schedule.py │ ├── requirements.txt │ ├── ssm │ │ ├── __init__.py │ │ ├── mamba_block.py │ │ ├── mamba_hybrid_layer_allocation.py │ │ ├── mamba_layer.py │ │ ├── mamba_mixer.py │ │ └── triton_cache_manager.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ ├── utils.py │ │ ├── vocab_input.py │ │ ├── vocab_input_store.py │ │ ├── vocab_output.py │ │ └── vocab_output_store.py │ ├── timers.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ ├── __init__.py │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── moe │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── spec_utils.py │ │ ├── torch_layer_norm.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── inference │ ├── __init__.py │ ├── arguments.py │ ├── checkpointing.py │ ├── gpt │ │ ├── __init__.py │ │ └── model_provider.py │ ├── static │ │ └── index.html │ ├── text_generation │ │ ├── __init__.py │ │ ├── api.py │ │ ├── beam_utils.py │ │ ├── communication.py │ │ ├── forward_step.py │ │ ├── generation.py │ │ ├── sampling.py │ │ └── tokenization.py │ └── text_generation_server.py ├── legacy │ ├── data │ │ ├── __init__.py │ │ ├── autoaugment.py │ │ ├── biencoder_dataset_utils.py │ │ ├── data_samplers.py │ │ ├── dataset_utils.py │ │ ├── ict_dataset.py │ │ ├── image_folder.py │ │ ├── multimodal_dataset.py │ │ ├── orqa_wiki_dataset.py │ │ ├── realm_dataset_utils.py │ │ ├── realm_index.py │ │ └── vit_dataset.py │ ├── fp16_deprecated │ │ └── loss_scaler.py │ ├── fused_kernels │ │ ├── __init__.py │ │ ├── compat.h │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_fused_kernels.py │ │ └── type_shim.h │ ├── indexer.py │ ├── model │ │ ├── __init__.py │ │ ├── bert_model.py │ │ ├── biencoder_model.py │ │ ├── 
classification.py │ │ ├── enums.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ ├── fused_softmax.py │ │ ├── gpt_model.py │ │ ├── language_model.py │ │ ├── module.py │ │ ├── multiple_choice.py │ │ ├── realm_model.py │ │ ├── rms_norm.py │ │ ├── t5_model.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── vision │ │ │ ├── classification.py │ │ │ ├── dino.py │ │ │ ├── esvit_swin_backbone.py │ │ │ ├── inpainting.py │ │ │ ├── knn_monitor.py │ │ │ ├── mit_backbone.py │ │ │ ├── swin_backbone.py │ │ │ ├── utils.py │ │ │ └── vit_backbone.py │ └── mpu │ │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py └── training │ ├── __init__.py │ ├── activations.py │ ├── arguments.py │ ├── async_utils.py │ ├── checkpointing.py │ ├── dist_signal_handler.py │ ├── global_vars.py │ ├── initialize.py │ ├── log_handler.py │ ├── one_logger_utils.py │ ├── optimizer_param_scheduler.py │ ├── theoretical_memory_usage.py │ ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py │ ├── training.py │ ├── utils.py │ └── yaml_arguments.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt.sh ├── pretrain_ict.py ├── pretrain_mamba.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── pretrain_vlm.py ├── pyproject.toml ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── functional_tests │ ├── __init__.py │ ├── jet_recipes │ │ ├── MR-bert.yaml │ │ ├── MR-gpt-nemo.yaml │ │ ├── MR-gpt.yaml │ │ ├── MR-multimodal.yaml │ │ ├── MR-t5.yaml │ │ ├── build-pyt.yaml │ │ ├── local-generator.py │ │ ├── nightly-bert.yaml │ │ ├── nightly-gpt.yaml │ │ ├── weekly-gpt.yaml │ │ └── weekly-t5.yaml │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── common.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── jet_test_pipeline.py │ │ ├── multitest_ci_pipeline.py │ │ ├── test_ci_pipeline.py │ │ ├── test_fp8_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ ├── _run_local_training.sh │ │ ├── restart_jet_log_jobs.sh │ │ └── run_release_record.sh │ ├── test_results │ │ └── jet │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json │ │ │ ├── 
bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json │ │ │ ├── bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json │ │ │ ├── bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json │ │ │ ├── bert_mr_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json │ │ │ ├── gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json │ │ │ ├── 
gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json │ │ │ ├── gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json │ │ │ ├── gpt3_mr_tp2_pp2_dgx_a100_1N8G.json │ │ │ ├── multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json │ │ │ ├── multimodal_llava_mr_mcore_te_tp2_pp4_dgx_a100_1N8G.json │ │ │ └── t5_220m_mr_mcore_tp2_pp4_dgx_a100_1N8G.json │ └── test_scripts │ │ ├── bert │ │ └── pretrain_bert_distributed_test.sh │ │ ├── gpt3 │ │ ├── pretrain_gpt3_distributed_test.sh │ │ └── pretrain_gpt3_nemo_test.sh │ │ ├── multimodal │ │ └── pretrain_llava_distributed_test.sh │ │ ├── retro │ │ └── pretrain_retro_distributed_test.sh │ │ └── t5 │ │ └── pretrain_t5_distributed_test.sh └── unit_tests │ ├── __init__.py │ ├── data │ ├── __init__.py │ ├── test_bin_reader.py │ ├── test_builder.py │ ├── test_gpt_dataset.py │ ├── test_multimodal_dataset.py │ ├── test_preprocess_data.py │ └── test_preprocess_mmdata.py │ ├── dist_checkpointing │ ├── __init__.py │ ├── conftest.py │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── test_bert_model.py │ │ ├── test_gpt_model.py │ │ ├── test_grouped_mlp.py │ │ ├── test_mlp_glu.py │ │ ├── test_retro_model.py │ │ ├── test_sequential_mlp.py │ │ └── test_t5_model.py │ ├── test_async_save.py │ ├── test_cached_metadata.py │ ├── test_flattened_resharding.py │ ├── test_fully_parallel.py │ ├── test_mapping.py │ ├── test_optimizer.py │ └── test_serialization.py │ ├── distributed │ └── test_param_and_grad_buffer.py │ ├── fusions │ └── test_torch_softmax.py │ ├── inference │ ├── __init__.py │ ├── engines │ │ ├── __init__.py │ │ └── test_mcore_engine.py │ ├── model_inference_wrappers │ │ ├── __init__.py │ │ ├── gpt │ │ │ └── test_gpt_inference_wrapper.py │ │ └── test_model_inference_wrapper_config.py │ ├── test_common_inference_params.py │ ├── test_inference_utils.py │ ├── test_modelopt_gpt_model.py │ ├── test_scheduler.py │ └── text_generation_controllers │ │ ├── __init__.py │ │ └── test_simple_text_generation_controller.py │ ├── models │ ├── __init__.py │ ├── test_base_embedding.py │ ├── test_bert_model.py │ ├── test_clip_vit_model.py │ 
├── test_gpt_model.py │ ├── test_llava_model.py │ ├── test_mamba_model.py │ ├── test_multimodal_projector.py │ └── test_t5_model.py │ ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py │ ├── tensor_parallel │ ├── __init__.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialization.py │ ├── test_layers.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_imports.py │ ├── test_local_multi_tensor_fns.py │ ├── test_num_microbatches_calculator.py │ ├── test_optimizer.py │ ├── test_parallel_state.py │ ├── test_training.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── moe │ ├── __init__.py │ ├── test_a2a_token_dispatcher.py │ ├── test_aux_loss.py │ ├── test_grouped_mlp.py │ ├── test_routers.py │ ├── test_sequential_mlp.py │ └── test_token_dispatcher.py │ ├── test_attention.py │ ├── test_attention_packed_seq.py │ ├── test_core_attention.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_retro_attention.py │ ├── test_spec_customization.py │ ├── test_transformer_block.py │ └── test_transformer_layer.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py └── huggingface.py ├── checkpoint ├── convert.py ├── hybrid_conversion.py ├── loader_llama_mistral.py ├── loader_mcore.py ├── loader_megatron.py ├── loader_mixtral_hf.py ├── saver_mcore.py ├── saver_megatron.py ├── setter.py └── utils.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── report_theoretical_memory.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── config_utils.py ├── docker │ └── Dockerfile ├── preprocess_data.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh └── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py ├── run_mamba_text_generation_server.py ├── run_text_generation_server.py ├── run_vlm_text_generation.py └── text_generation_cli.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 
21 | 
22 | **Environment (please complete the following information):**
23 | - Megatron-LM commit ID
24 | - PyTorch version
25 | - CUDA version
26 | - NCCL version
27 | 
28 | **Proposed fix**
29 | If you have a proposal for how to fix the issue, state it here or link to a PR.
30 | 
31 | **Additional context**
32 | Add any other context about the problem here.
33 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/enhancement.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: ENHANCEMENT
3 | about: Suggest an idea to improve this project
4 | title: "[ENHANCEMENT]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Proposed implementation**
20 | If you have a proposed implementation for the feature, state it here or link to a PR.
21 | 
22 | **Additional context**
23 | Add any other context or screenshots about the feature request here.
24 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: QUESTION
3 | about: Ask a question about Megatron-LM that is not a bug, regression, or enhancement request
4 | title: "[QUESTION]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Your question**
11 | Ask a clear and concise question about Megatron-LM.
12 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/regression.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: REGRESSION
3 | about: Report a regression in speed or accuracy due to a Megatron-LM update
4 | title: "[REGRESSION]"
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Describe the regression**
11 | A clear and concise description of what the regression is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.
15 | 
16 | **Previous performance**
17 | What speed or accuracy did you previously see?
18 | 
19 | **New performance**
20 | What speed or accuracy do you see after the update?
21 | 
22 | **Stack trace/logs**
23 | If applicable, add the stack trace or logs related to the regression.
24 | 
25 | **Environment (please complete the following information):**
26 | - Previous Megatron-LM commit ID
27 | - New Megatron-LM commit ID
28 | - Previous PyTorch version
29 | - New PyTorch version
30 | - Previous CUDA version
31 | - New CUDA version
32 | - Previous NCCL version
33 | - New NCCL version
34 | 
35 | **Proposed fix**
36 | If you have a proposal for how to fix the issue, state it here or link to a PR.
37 | 
38 | **Additional context**
39 | Add any other context about the problem here.
40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.' 27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | .vscode 10 | local/ 11 | *.tar.gz 12 | *.tar.gz.* 13 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | [MCORE][3] 2 | megatron/core/ @shanmugamr @jcasper @eharper @terryk @okoenig 3 | 4 | [TESTS] 5 | tests/ @shanmugamr @terryk @okoenig 6 | 7 | [MODELOPT] 8 | examples/inference/quantization @chenhany @kmorabia 9 | -------------------------------------------------------------------------------- /Dockerfile.linting: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | ARG FROM_IMAGE_NAME 4 | FROM $FROM_IMAGE_NAME 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ 8 | /etc/apt/apt.conf.d/docker-clean 9 | 10 | 11 | RUN pip3 install --no-cache-dir \ 12 | black==24.4.2 \ 13 | isort 14 | 15 | COPY . 
/opt/megatron-lm
16 | 
17 | WORKDIR /opt/megatron-lm
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include megatron/core/requirements.txt
2 | 
--------------------------------------------------------------------------------
/artifact/example-results/quick-exp.txt:
--------------------------------------------------------------------------------
1 | Method: baseline
2 | Peak Memory: 33.7227 GB
3 | MFU: 28.8833 %
4 | 
5 | Method: redis
6 | Peak Memory: 33.7227 GB
7 | MFU: 44.6221 %
8 | 
9 | Method: interlaced
10 | Peak Memory: 30.7168 GB
11 | MFU: 53.8638 %
12 | 
13 | Method: vocab-1
14 | Peak Memory: 27.3848 GB
15 | MFU: 53.9708 %
16 | 
17 | Method: vocab-2
18 | Peak Memory: 26.1094 GB
19 | MFU: 53.5333 %
--------------------------------------------------------------------------------
/docs/source/api-guide/dist_checkpointing.strategies.rst:
--------------------------------------------------------------------------------
1 | dist\_checkpointing.strategies package
2 | ======================================
3 | 
4 | Package defining different checkpoint formats (backends) and saving/loading algorithms (strategies).
5 | 
6 | Strategies can be used for implementing new checkpoint formats or new ways of saving/loading existing formats that are better suited to a given use case.
7 | Strategies are passed to the `dist_checkpointing.load` and `dist_checkpointing.save` functions and control the actual saving/loading procedure.
8 | 
9 | Submodules
10 | ----------
11 | 
12 | dist\_checkpointing.strategies.base module
13 | ------------------------------------------
14 | 
15 | .. automodule:: core.dist_checkpointing.strategies.base
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 | 
20 | dist\_checkpointing.strategies.tensorstore module
21 | -------------------------------------------------
22 | 
23 | .. automodule:: core.dist_checkpointing.strategies.tensorstore
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
27 | 
28 | dist\_checkpointing.strategies.two\_stage module
29 | ------------------------------------------------
30 | 
31 | .. automodule:: core.dist_checkpointing.strategies.two_stage
32 |    :members:
33 |    :undoc-members:
34 |    :show-inheritance:
35 | 
36 | dist\_checkpointing.strategies.zarr module
37 | ------------------------------------------
38 | 
39 | .. automodule:: core.dist_checkpointing.strategies.zarr
40 |    :members:
41 |    :undoc-members:
42 |    :show-inheritance:
43 | 
44 | Module contents
45 | ---------------
46 | 
47 | .. automodule:: core.dist_checkpointing.strategies
48 |    :members:
49 |    :undoc-members:
50 |    :show-inheritance:
51 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/index.rst:
--------------------------------------------------------------------------------
1 | API Guide
2 | =========
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    models
8 |    tensor_parallel
9 |    context_parallel
10 |    pipeline_parallel
11 |    fusions
12 |    transformer
13 |    moe
14 |    dist_checkpointing
15 |    distributed
16 |    datasets
17 |    num_microbatches_calculator
18 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.bert.rst:
--------------------------------------------------------------------------------
1 | models.bert package
2 | ===================
3 | Useful package for training BERT and BERT-like encoder-only models. It optionally comes with a binary head that can be used for classification tasks.
4 | 
5 | Submodules
6 | ----------
7 | 
8 | models.bert.bert\_model module
9 | ------------------------------
10 | 
11 | .. automodule:: core.models.bert.bert_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.bert
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.gpt.rst:
--------------------------------------------------------------------------------
1 | models.gpt package
2 | ==================
3 | This is the implementation of the popular GPT model. It supports several features such as model parallelization (tensor parallel, pipeline parallel, data parallel), mixture of experts, FP8, distributed optimizer, etc. We are constantly adding new features, so be on the lookout, or raise an issue if you want to have something added.
4 | 
5 | Submodules
6 | ----------
7 | 
8 | models.gpt.gpt\_model module
9 | ----------------------------
10 | 
11 | .. automodule:: core.models.gpt.gpt_model
12 |    :members:
13 |    :undoc-members:
14 |    :show-inheritance:
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: core.models.gpt
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.rst:
--------------------------------------------------------------------------------
1 | models package
2 | ==============
3 | This package contains most of the popular LLMs. Currently we support GPT, BERT, T5, and Retro. This is an ever-growing list, so keep an eye out.
4 | 
5 | Subpackages
6 | -----------
7 | 
8 | .. toctree::
9 |    :maxdepth: 4
10 | 
11 |    models.gpt
12 |    models.t5
13 |    models.bert
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/models.t5.rst:
--------------------------------------------------------------------------------
1 | models.t5 package
2 | =================
3 | 
4 | Submodules
5 | ----------
6 | 
7 | models.t5.t5\_model module
8 | --------------------------
9 | 
10 | .. automodule:: core.models.T5.t5_model
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: core.models.T5
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/moe.rst:
--------------------------------------------------------------------------------
1 | Mixture of Experts package
2 | ==========================
3 | 
4 | .. mdinclude:: ../../../megatron/core/transformer/moe/README.md
5 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/num_microbatches_calculator.rst:
--------------------------------------------------------------------------------
1 | Microbatches Calculator
2 | =======================
3 | This API is used to calculate the number of microbatches per training step, given the global batch size, micro batch size, and data-parallel size.
4 | 
5 | 
6 | Module contents
7 | ---------------
8 | 
9 | .. automodule:: core.num_microbatches_calculator
10 |    :members:
11 |    :undoc-members:
12 |    :show-inheritance:
13 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/pipeline_parallel.rst:
--------------------------------------------------------------------------------
1 | pipeline\_parallel package
2 | ==========================
3 | 
4 | This package contains implementations for two different pipeline parallelism
5 | schedules (one without interleaving and one with interleaving, see `Efficient
6 | Large-Scale Language Model Training on GPU Clusters Using Megatron-LM <https://arxiv.org/abs/2104.04473>`_
7 | for details), and a default no-pipelining schedule. It also contains methods
8 | for the point-to-point communication that is needed between pipeline stages.
9 | 
10 | Submodules
11 | ----------
12 | 
13 | pipeline\_parallel.p2p\_communication module
14 | --------------------------------------------
15 | 
16 | Contains implementations for the various point-to-point communication needed
17 | (e.g., `recv_forward` and `recv_backward`) in the different pipeline parallelism
18 | schedules.
19 | 
20 | .. automodule:: core.pipeline_parallel.p2p_communication
21 |    :members:
22 |    :undoc-members:
23 |    :show-inheritance:
24 | 
25 | pipeline\_parallel.schedules module
26 | -----------------------------------
27 | 
28 | Contains implementations for two pipeline parallelism schedules
29 | (`forward_backward_pipelining_with_interleaving` for pipeline parallelism with
30 | interleaving, `forward_backward_pipelining_without_interleaving` for pipeline
31 | parallelism without interleaving) and a default no-pipelining schedule
32 | (`forward_backward_no_pipelining`). `get_forward_backward_func` returns the right
33 | scheduling function to use based on the configuration being trained
34 | (e.g., if pipeline-parallel size is 1, use `forward_backward_no_pipelining`).
35 | 
36 | .. automodule:: core.pipeline_parallel.schedules
37 |    :members:
38 |    :undoc-members:
39 |    :show-inheritance:
40 | 
41 | Module contents
42 | ---------------
43 | 
44 | .. automodule:: core.pipeline_parallel
45 |    :members:
46 |    :undoc-members:
47 |    :show-inheritance:
48 | 
--------------------------------------------------------------------------------
/docs/source/api-guide/tensor_parallel.rst:
--------------------------------------------------------------------------------
1 | tensor\_parallel package
2 | ========================
3 | 
4 | This package contains an implementation for tensor parallelism in transformer
5 | models (see `Megatron-LM: Training Multi-Billion Parameter Language Models
6 | Using Model Parallelism <https://arxiv.org/abs/1909.08053>`_ and `Reducing
7 | Activation Recomputation in Large Transformer Models <https://arxiv.org/abs/2205.05198>`_
8 | for details).
9 | 
10 | Submodules
11 | ----------
12 | 
13 | tensor\_parallel.cross\_entropy module
14 | --------------------------------------
15 | 
16 | .. automodule:: core.tensor_parallel.cross_entropy
17 |    :members:
18 |    :undoc-members:
19 |    :show-inheritance:
20 | 
21 | tensor\_parallel.data module
22 | ----------------------------
23 | 
24 | .. automodule:: core.tensor_parallel.data
25 |    :members:
26 |    :undoc-members:
27 |    :show-inheritance:
28 | 
29 | tensor\_parallel.layers module
30 | ------------------------------
31 | 
32 | .. automodule:: core.tensor_parallel.layers
33 |    :members:
34 |    :undoc-members:
35 |    :show-inheritance:
36 | 
37 | tensor\_parallel.mappings module
38 | --------------------------------
39 | 
40 | .. automodule:: core.tensor_parallel.mappings
41 |    :members:
42 |    :undoc-members:
43 |    :show-inheritance:
44 | 
45 | tensor\_parallel.random module
46 | ------------------------------
47 | 
48 | .. automodule:: core.tensor_parallel.random
49 |    :members:
50 |    :undoc-members:
51 |    :show-inheritance:
52 | 
53 | tensor\_parallel.utils module
54 | -----------------------------
55 | 
56 | .. automodule:: core.tensor_parallel.utils
57 |    :members:
58 |    :undoc-members:
59 |    :show-inheritance:
60 | 
61 | Module contents
62 | ---------------
63 | 
64 | .. automodule:: core.tensor_parallel
65 |    :members:
66 |    :undoc-members:
67 |    :show-inheritance:
68 | 
--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/context_parallel/CP_overview.png
--------------------------------------------------------------------------------
/docs/source/images/context_parallel/CP_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/context_parallel/CP_results.png
--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/distrib_optimizer/data_flow.png
--------------------------------------------------------------------------------
/docs/source/images/distrib_optimizer/sharding_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/docs/source/images/distrib_optimizer/sharding_scheme.png
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Lumache documentation master file, created by
2 |    sphinx-quickstart on Tue Aug 15 13:44:10 2023.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | Megatron Core User Guide
7 | ===================================
8 | 
9 | **Megatron Core** is a Python library that provides the core components required to build your language models.
10 | A reference implementation of Megatron Core can be found in `NeMo <https://github.com/NVIDIA/NeMo>`_. It offers a *simple* and
11 | *intuitive* API.
12 | 
13 | .. toctree::
14 |    :maxdepth: 2
15 |    :caption: User Guide
16 | 
17 |    user-guide/index
18 | 
19 | .. toctree::
20 |    :maxdepth: 3
21 |    :caption: API Guide
22 | 
23 |    api-guide/index
24 | 
--------------------------------------------------------------------------------
/docs/source/user-guide/index.rst:
--------------------------------------------------------------------------------
1 | USER GUIDE
2 | ==========
3 | 
4 | .. mdinclude:: ../../../megatron/core/QuickStart.md
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh:
--------------------------------------------------------------------------------
1 | VOCAB_FILE=gpt2-vocab.json
2 | MERGE_FILE=gpt2-merges.txt
3 | 
4 | python3 tools/preprocess_data.py \
5 | --input $1 \
6 | --output-prefix $2 \
7 | --vocab-file $VOCAB_FILE \
8 | --merge-file $MERGE_FILE \
9 | --tokenizer-type GPT2BPETokenizer \
10 | --append-eod --workers 20 --chunk-size 25
11 | 
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | CHECKPOINT_PATH=$2 # Your model ckpt
3 | VOCAB_FILE=gpt2-vocab.json
4 | MERGE_FILE=gpt2-merges.txt
5 | 
6 | GPUS_PER_NODE=1
7 | # Change for multinode config
8 | MASTER_ADDR=localhost
9 | MASTER_PORT=$(($RANDOM + 1024))
10 | NNODES=1
11 | NODE_RANK=0
12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
13 | NUM_SAMPLES=$(wc -l < $1)
14 | PREFIX=$(basename $2)
15 | SEED=$(($RANDOM))
16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl
17 | 
18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
19 | 
20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
21 | --tensor-model-parallel-size 1 \
22 | --num-layers 24 \
23 | --hidden-size 2048 \
24 | --load $CHECKPOINT_PATH \
25 | --num-attention-heads 32 \
26 | --max-position-embeddings 2048 \
27 | --tokenizer-type GPT2BPETokenizer \
28 | --fp16 \
29 | --micro-batch-size 400 \
30 | --seq-length 2048 \
31 | --out-seq-length 20 \
32 | --temperature 1.0 \
33 | --vocab-file $VOCAB_FILE \
34 | --merge-file $MERGE_FILE \
35 | --sample-input-file $1 \
36 | --sample-output-file $OUTPUT \
37 | --num-samples $NUM_SAMPLES \
38 | --max-tokens-to-oom 1200000 \
39 | --top_p 0.9 \
40 | --seed $SEED
41 | 
42 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | CHECKPOINT_PATH=$2 # Your model ckpt
3 | SHARE_DATA=$PWD # current work dir
4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab
5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file
6 | 
7 | GPUS_PER_NODE=1
8 | # Change for multinode config
9 | MASTER_ADDR=localhost
10 | MASTER_PORT=$(($RANDOM + 1024))
11 | NNODES=1
12 | NODE_RANK=0
13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
14 | SEED=$3
15 | SUFFIX=$(basename $CHECKPOINT_PATH)
16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/
17 | mkdir -p $save_dir
18 | echo $save_dir/$SEED.out
19 | 
20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
21 | 
22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
23 | --tensor-model-parallel-size 1 \
24 | --num-layers 24 \
25 | --hidden-size 2048 \
26 | --load $CHECKPOINT_PATH \
27 | --num-attention-heads 32 \
28 | --max-position-embeddings 2048 \
29 | --tokenizer-type GPT2BPETokenizer \
30 | --fp16 \
31 | --micro-batch-size 150 \
32 | --seq-length 2048 \
33 | --out-seq-length 1000 \
34 | --temperature 1.0 \
35 | --vocab-file $VOCAB_FILE \
36 | --merge-file $MERGE_FILE \
37 | --num-samples $1 \
38 | --top_p 0.9 \
39 | --max-tokens-to-oom 1200000 \
40 | --genfile $save_dir/$SEED.out \
41 | --seed $SEED
42 | 
43 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation
3 | 
4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge and response generation. More details are available in the [`msdp` task directory](../../../tasks/msdp).
5 | 
6 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #########################
4 | # Evaluate the F1 scores.
5 | #########################
6 | 
7 | WORLD_SIZE=1
8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
9 | --nnodes 1 \
10 | --node_rank 0 \
11 | --master_addr localhost \
12 | --master_port 6000"
13 | 
14 | MODEL_GEN_PATH= \
15 | (e.g., /testseen_knowledge_generations.txt)
16 | GROUND_TRUTH_PATH= \
17 | (e.g., /testseen_knowledge_reference.txt)
18 | 
19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
20 | --num-layers 24 \
21 | --hidden-size 1024 \
22 | --num-attention-heads 16 \
23 | --seq-length 2048 \
24 | --max-position-embeddings 2048 \
25 | --micro-batch-size 4 \
26 | --task MSDP-EVAL-F1 \
27 | --guess-file ${MODEL_GEN_PATH} \
28 | --answer-file ${GROUND_TRUTH_PATH}
29 | 
30 | 
31 | ############################################
32 | # Evaluate BLEU, METEOR, and ROUGE-L scores.
33 | ############################################
34 | 
35 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
36 | # evaluate the BLEU, METEOR, and ROUGE-L scores.
37 | 
38 | # To evaluate on these metrics, please set up the environment based on
39 | # the nlg-eval GitHub repository, and run the corresponding evaluation commands.
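# A rough sketch of that setup, assuming the instructions in the nlg-eval
# README still apply (the two commands below are not part of this repo):
#   pip install git+https://github.com/Maluuba/nlg-eval.git@master
#   nlg-eval --setup    # downloads the data files the metrics need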
40 | 
41 | nlg-eval \
42 | --hypothesis= \
43 | --references=
44 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prep_resp_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Preparing the input file for the response generation (second-stage prompting)
4 | 
5 | DIR=`pwd`
6 | 
7 | TEST_FILE= \
8 | (e.g., /testseen_processed.txt)
9 | KNOWLEDGE_FILE= \
10 | (e.g., /testseen_knowledge_generations.txt)
11 | PROCESSED_FILE= \
12 | (e.g., /testseen_processed_with_generated_knowledge.txt)
13 | 
14 | python ${DIR}/tasks/msdp/preprocessing.py \
15 | --func prepare_input \
16 | --test_file ${TEST_FILE} \
17 | --knwl_gen_file ${KNOWLEDGE_FILE} \
18 | --processed_file ${PROCESSED_FILE}
19 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
4 | # The input contains prompts and the current dialogue context; the output is the relevant knowledge
5 | # The size of the pretrained language model is 357M
6 | 
7 | WORLD_SIZE=8
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
10 | --nnodes 1 \
11 | --node_rank 0 \
12 | --master_addr localhost \
13 | --master_port 6000"
14 | 
15 | CHECKPOINT_PATH= (e.g., /357m)
16 | VOCAB_PATH= (e.g., /gpt2-vocab.json)
17 | MERGE_PATH= (e.g., /gpt2-merges.txt)
18 | INPUT_PATH= \
19 | (e.g., /testseen_processed.txt)
20 | PROMPT_PATH= \
21 | (e.g., /testseen_knowledge_prompts.json)
22 | OUTPUT_PATH= \
23 | (e.g., /testseen_knowledge_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 | --num-layers 24 \
27 | --hidden-size 1024 \
28 | --num-attention-heads 16 \
29 | --seq-length 2048 \
30 | --max-position-embeddings 2048 \
31 | --micro-batch-size 1 \
32 | --vocab-file ${VOCAB_PATH} \
33 | --merge-file ${MERGE_PATH} \
34 | --load ${CHECKPOINT_PATH} \
35 | --fp16 \
36 | --DDP-impl torch \
37 | --tokenizer-type GPT2BPETokenizer \
38 | --sample-input-file ${INPUT_PATH} \
39 | --sample-output-file ${OUTPUT_PATH} \
40 | --prompt-file ${PROMPT_PATH} \
41 | --prompt-type knowledge \
42 | --num-prompt-examples 10 \
43 | --task MSDP-PROMPT
44 | 
45 | # NOTE: If you use an API for the model generation, please use
46 | # the "--api-prompt" flag (set its value to True).
47 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response
4 | # The input contains prompts, the current dialogue context, and the knowledge generated in Stage-1
5 | # The output is the corresponding response.
6 | # The size of the pretrained language model is 357M
7 | 
8 | WORLD_SIZE=8
9 | 
10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
11 | --nnodes 1 \
12 | --node_rank 0 \
13 | --master_addr localhost \
14 | --master_port 6000"
15 | 
16 | CHECKPOINT_PATH= (e.g., /357m)
17 | VOCAB_PATH= (e.g., /gpt2-vocab.json)
18 | MERGE_PATH= (e.g., /gpt2-merges.txt)
19 | INPUT_PATH= (e.g., /testseen_processed.txt)
20 | PROMPT_PATH= \
21 | (e.g., /response_prompts.txt)
22 | OUTPUT_PATH= \
23 | (e.g., /output_testseen_response_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 | --num-layers 24 \
27 | --hidden-size 1024 \
28 | --num-attention-heads 16 \
29 | --seq-length 2048 \
30 | --max-position-embeddings 2048 \
31 | --micro-batch-size 1 \
32 | --vocab-file ${VOCAB_PATH} \
33 | --merge-file ${MERGE_PATH} \
34 | --load ${CHECKPOINT_PATH} \
35 | --fp16 \
36 | --DDP-impl torch \
37 | --tokenizer-type GPT2BPETokenizer \
38 | --sample-input-file ${INPUT_PATH} \
39 | --sample-output-file ${OUTPUT_PATH} \
40 | --prompt-file ${PROMPT_PATH} \
41 | --prompt-type response \
42 | --num-prompt-examples 20 \
43 | --task MSDP-PROMPT
44 | 
45 | # NOTE: If you use an API for the model generation, please use
46 | # the "--api-prompt" flag (set its value to True).
47 | 
--------------------------------------------------------------------------------
/examples/academic_paper_scripts/sc21/CONFIG.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | 
4 | # SLURM options.
5 | export SLURM_PARTITION=
6 | export SLURM_ACCOUNT=
7 | 
8 | 
9 | # Source code.
10 | export MEGATRON_CODE_DIR=
11 | 
12 | 
13 | # This variable is used to mount the relevant part of the filesystem
14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
15 | # launch directory already get mounted; this variable should be used to
16 | # mount the directories that contain the data and tokenizer files.
17 | export DOCKER_MOUNT_DIR=
18 | 
19 | 
20 | # Data and tokenizer files.
21 | MEGATRON_DATA=
22 | BPE_VOCAB_FILE=
23 | BPE_MERGE_FILE=
24 | 
25 | 
26 | # Megatron input parameters.
27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
28 | # that are not listed here.
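# As a concrete example (taken from the run_figure_*.sh scripts in this
# directory), a caller typically sets something like
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# before sourcing this file.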
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 
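# (With the interleaved schedule, each GPU is assigned several smaller
# virtual pipeline stages instead of one contiguous stage, which reduces
# the pipeline bubble; see docs/source/api-guide/pipeline_parallel.rst.
# Here it is enabled via --num-layers-per-virtual-pipeline-stage below.)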
8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 
8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/academic_paper_scripts/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 
18 | if [ ${SCATTER_GATHER} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${SCATTER_GATHER} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | -------------------------------------------------------------------------------- /examples/bert/README.md: --------------------------------------------------------------------------------
1 | # BERT MODEL
2 | 
3 | ## Table of contents
4 | - [1. Training Setup](#1-training-setup)
5 | - [2. Configurations](#2-configurations)
6 | 
7 | ## 1. Training setup
8 | 
9 | 
10 | To run the model using a Docker container, run it as follows:
11 | ```
12 | PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
13 | CHECKPOINT_PATH="" #<Specify path>
14 | TENSORBOARD_LOGS_PATH="" #<Specify path>
15 | VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
16 | DATA_PATH="" #<Specify path and file prefix>_text_document
17 | 
18 | docker run \
19 |   --gpus=all \
20 |   --ipc=host \
21 |   --workdir /workspace/megatron-lm \
22 |   -v /path/to/data:/path/to/data \
23 |   -v /path/to/megatron-lm:/workspace/megatron-lm \
24 |   $PYTORCH_IMAGE \
25 |   bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
26 | 
27 | ```
28 | NOTE: Depending on the environment you are running in, the above command might look slightly different.
29 | 
30 | 
31 | ## 2. Configurations
32 | 
33 | The example in this folder shows you how to run a 340M-parameter model. There are other configurations you could run as well:
34 | 
35 | ### 4B
36 | ```
37 | --num-layers 48 \
38 | --hidden-size 2560 \
39 | --num-attention-heads 32 \
40 | --tensor-model-parallel-size 1 \
41 | --pipeline-model-parallel-size 1 \
42 | 
43 | ```
44 | 
45 | ### 20B
46 | ```
47 | --num-layers 48 \
48 | --hidden-size 6144 \
49 | --num-attention-heads 96 \
50 | --tensor-model-parallel-size 4 \
51 | --pipeline-model-parallel-size 4 \
52 | 
53 | ``` -------------------------------------------------------------------------------- /examples/gpt3/README.md: --------------------------------------------------------------------------------
1 | # GPT3 MODEL
2 | 
3 | ## Table of contents
4 | - [1. Training Setup](#1-training-setup)
5 | - [2. Configurations](#2-configurations)
6 | - [3. Training Results](#3-training-results)
7 | 
8 | ## 1.
Training setup
9 | 
10 | 
11 | To run the model using a Docker container, run it as follows:
12 | ```
13 | PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
14 | CHECKPOINT_PATH="" #<Specify path>
15 | TENSORBOARD_LOGS_PATH="" #<Specify path>
16 | VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
17 | MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
18 | DATA_PATH="" #<Specify path and file prefix>_text_document
19 | 
20 | docker run \
21 |   --gpus=all \
22 |   --ipc=host \
23 |   --workdir /workspace/megatron-lm \
24 |   -v /path/to/data:/path/to/data \
25 |   -v /path/to/megatron-lm:/workspace/megatron-lm \
26 |   $PYTORCH_IMAGE \
27 |   bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
28 | 
29 | ```
30 | NOTE: Depending on the environment you are running in, the above command might look slightly different.
31 | 
32 | 
33 | ## 2. Configurations
34 | 
35 | The example in this folder shows you how to run a 175B-parameter model. There are other configurations you could run as well:
36 | 
37 | ### 345M
38 | ```
39 | --num-layers 12 \
40 | --hidden-size 512 \
41 | --num-attention-heads 8 \
42 | --seq-length 1024 \
43 | --tensor-model-parallel-size 1 \
44 | --pipeline-model-parallel-size 1 \
45 | 
46 | ```
47 | 
48 | ### 857M
49 | ```
50 | --num-layers 24 \
51 | --hidden-size 1024 \
52 | --num-attention-heads 16 \
53 | --seq-length 2048 \
54 | --tensor-model-parallel-size 1 \
55 | --pipeline-model-parallel-size 1 \
56 | 
57 | ```
58 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This example will start serving the 345M model.
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
4 |                   --nnodes 1 \
5 |                   --node_rank 0 \
6 |                   --master_addr localhost \
7 |                   --master_port 6000"
8 | 
9 | CHECKPOINT=
10 | VOCAB_FILE=
11 | MERGE_FILE=
12 | 
13 | export CUDA_DEVICE_MAX_CONNECTIONS=1
14 | 
15 | pip install flask-restful
16 | 
17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
18 |       --tensor-model-parallel-size 1 \
19 |       --pipeline-model-parallel-size 1 \
20 |       --num-layers 24 \
21 |       --hidden-size 1024 \
22 |       --load ${CHECKPOINT} \
23 |       --num-attention-heads 16 \
24 |       --max-position-embeddings 1024 \
25 |       --tokenizer-type GPT2BPETokenizer \
26 |       --fp16 \
27 |       --micro-batch-size 1 \
28 |       --seq-length 1024 \
29 |       --vocab-file $VOCAB_FILE \
30 |       --merge-file $MERGE_FILE \
31 |       --seed 42
32 | -------------------------------------------------------------------------------- /examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This example will start serving the 345M model that is partitioned 8-way tensor parallel
3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \
4 |                   --nnodes 1 \
5 |                   --node_rank 0 \
6 |                   --master_addr localhost \
7 |                   --master_port 6000"
8 | 
9 | CHECKPOINT=
10 | VOCAB_FILE=
11 | MERGE_FILE=
12 | 
13 | pip install flask-restful
14 | 
15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
16 |        --tensor-model-parallel-size 8 \
17 |        --pipeline-model-parallel-size 1 \
18 |        --num-layers 24 \
19 |        --hidden-size 1024 \
20 |        --load ${CHECKPOINT} \
21 |        --num-attention-heads 16 \
22 |        --max-position-embeddings 1024 \
23 |        --tokenizer-type GPT2BPETokenizer \
24 |        --fp16 \
25 |        --micro-batch-size 1 \
26 |        --seq-length 1024 \
27 |        --vocab-file $VOCAB_FILE \
28 |        --merge-file $MERGE_FILE \
29 |        --seed 42
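# Once the server is up, requests can be sent to it over HTTP. A sketch of a
# request (this assumes the server's default port 5000 and the PUT /api JSON
# interface used by tools/text_generation_cli.py; adjust the URL to your setup):
#   curl 'http://localhost:5000/api' -X 'PUT' \
#        -H 'Content-Type: application/json; charset=UTF-8' \
#        -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'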
30 | -------------------------------------------------------------------------------- /examples/mamba/.gitignore: --------------------------------------------------------------------------------
1 | checkpoints/
2 | data-cache/
3 | tensorboard/
4 | triton-cache/
5 | -------------------------------------------------------------------------------- /examples/mamba/Dockerfile: --------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:24.01-py3
2 | 
3 | RUN pip uninstall -y triton && \
4 |     pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
5 | 
6 | # The causal-conv1d and mamba-ssm packages below are built from scratch here
7 | # (which takes significant time) because there are no wheels available on PyPI
8 | # for these relatively newer versions of the packages that are compatible with
9 | # the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
10 | # are using (in the NGC base container). Generally, if the package is not
11 | # compatible with the PyTorch version, then it will generate a Python import
12 | # error. The package authors tend to release wheels only for new versions of
13 | # these packages that are compatible with the versions of regular PyTorch and
14 | # NGC-variant PyTorch that are newer at the time of release. So, to use newer
15 | # versions of these packages with relatively older versions of the NGC PyTorch
16 | # container, we tend to have to build the packages from scratch.
17 | 
18 | RUN cd /tmp && \
19 |     git clone https://github.com/Dao-AILab/causal-conv1d.git && \
20 |     cd causal-conv1d && \
21 |     git checkout v1.2.2.post1 && \
22 |     CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
23 |     cd .. && \
24 |     rm -rf causal-conv1d
25 | 
26 | RUN cd /tmp && \
27 |     git clone https://github.com/state-spaces/mamba.git && \
28 |     cd mamba && \
29 |     git checkout v2.0.3 && \
30 |     MAMBA_FORCE_BUILD=TRUE pip install . && \
31 |     cd .. && \
32 |     rm -rf mamba
33 | -------------------------------------------------------------------------------- /examples/mamba/run_text_gen_server_8b.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
4 | # To launch the client: python ../../tools/text_generation_cli.py <server-URL>
5 | 
6 | CHECKPOINT_PATH=$1
7 | TOKENIZER_PATH=$2
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | export NCCL_IB_SL=1
16 | export CUDA_DEVICE_MAX_CONNECTIONS=1
17 | export NCCL_IB_TIMEOUT=19
18 | export NCCL_IB_QPS_PER_CONNECTION=4
19 | 
20 | export TRITON_CACHE_DIR="./triton-cache/"
21 | export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
22 | 
23 | torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
24 |        --tensor-model-parallel-size 1 \
25 |        --pipeline-model-parallel-size 1 \
26 |        --untie-embeddings-and-output-weights \
27 |        --num-layers 56 \
28 |        --hidden-size 4096 \
29 |        --load ${CHECKPOINT_PATH} \
30 |        --num-attention-heads 32 \
31 |        --group-query-attention \
32 |        --num-query-groups 8 \
33 |        --hybrid-attention-ratio 0.08 \
34 |        --hybrid-mlp-ratio 0.5 \
35 |        --attention-dropout 0.0 \
36 |        --hidden-dropout 0.0 \
37 |        --disable-bias-linear \
38 |        --normalization RMSNorm \
39 |        --seq-length 4096 \
40 |        --max-position-embeddings 4096 \
41 |        --position-embedding-type none \
42 |        --tokenizer-type GPTSentencePieceTokenizer \
43 |        --tokenizer-model ${TOKENIZER_PATH} \
44 |        --distributed-backend nccl \
45 |        --distributed-timeout-minutes 1440 \
46 |        --bf16 \
47 |        --micro-batch-size 1 \
48 |        --use-mcore-models \
49 |        --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
50 |        --seed 42
51 | -------------------------------------------------------------------------------- /examples/mamba/run_text_gen_server_8b_gpt3.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Use: ./run_text_gen_server_8b_gpt3.sh <checkpoint-path> <tokenizer-path>
4 | # To launch the client: python ../../tools/text_generation_cli.py <server-URL>
5 | 
6 | CHECKPOINT_PATH=$1
7 | TOKENIZER_PATH=$2
8 | 
9 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | export NCCL_IB_SL=1
16 | export CUDA_DEVICE_MAX_CONNECTIONS=1
17 | export NCCL_IB_TIMEOUT=19
18 | export NCCL_IB_QPS_PER_CONNECTION=4
19 | 
20 | torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \
21 |        --tensor-model-parallel-size 1 \
22 |        --pipeline-model-parallel-size 1 \
23 |        --use-flash-attn \
24 |        --apply-layernorm-1p \
25 |        --untie-embeddings-and-output-weights \
26 |        --num-layers 32 \
27 |        --hidden-size 4096 \
28 |        --load ${CHECKPOINT_PATH} \
29 |        --num-attention-heads 32 \
30 |        --attention-dropout 0.0 \
31 |        --hidden-dropout 0.0 \
32 |        --disable-bias-linear \
33 |        --seq-length 4096 \
34 |        --max-position-embeddings 4096 \
35 |        --position-embedding-type rope \
36 |        --rotary-percent 0.5 \
37 |        --squared-relu \
38 |        --tokenizer-type GPTSentencePieceTokenizer \
39 |        --tokenizer-model ${TOKENIZER_PATH} \
40 |        --distributed-backend nccl \
41 |        --distributed-timeout-minutes 1440 \
42 |        --bf16 \
43 |        --micro-batch-size 1 \
44 |        --use-mcore-models \
45 |        --transformer-impl local \
46 |        --seed 42
47 | -------------------------------------------------------------------------------- /examples/multimodal/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:24.02-py3
2 | 
3 | RUN apt update && \
4 |     apt -y upgrade && \
5 |     apt install -y --no-install-recommends \
6 |     software-properties-common \
7 |     build-essential \
8 |     python3-pip \
9 |     python3-dev \
10 |     bash \
11 |     git \
12 |     vim \
13 |     python-is-python3 \
14 |     default-jre
15 | 
16 | RUN pip install --upgrade pip
17 | RUN pip install einops einops-exts sentencepiece braceexpand webdataset
18 | RUN pip install transformers datasets
19 | RUN pip install pytest-cov pytest_mock nltk wrapt
20 | RUN pip install zarr "tensorstore==0.1.45"
21 | RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main
22 | RUN pip install black==19.10b0 isort click==8.0.2
23 | RUN pip install pycocoevalcap megatron-energon
24 | RUN pip install git+https://github.com/openai/CLIP.git
25 | # Use --no-deps for the following to avoid outdated and unnecessary dependencies.
26 | RUN pip install mmf --no-deps
27 | RUN pip install open-flamingo[eval] --no-deps
28 | -------------------------------------------------------------------------------- /examples/multimodal/assets/pretrain_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/examples/multimodal/assets/pretrain_curves.png -------------------------------------------------------------------------------- /examples/multimodal/combine_mistral_clip.sh: --------------------------------------------------------------------------------
1 | 
2 | MCORE_MISTRAL=  # directory of the mcore Mistral checkpoint
3 | MCORE_CLIP=  # directory of the mcore CLIP checkpoint
4 | OUTPUT_DIR=  # directory for the combined checkpoint
5 | 
6 | python examples/multimodal/combine_state_dicts.py \
7 |     --input \
8 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_00/model_optim_rng.pt \
9 |     ${MCORE_CLIP}/iter_0000001/mp_rank_00/model_optim_rng.pt \
10 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_01/model_optim_rng.pt \
11 |     ${MCORE_CLIP}/iter_0000001/mp_rank_01/model_optim_rng.pt \
12 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_02/model_optim_rng.pt \
13 |     ${MCORE_CLIP}/iter_0000001/mp_rank_02/model_optim_rng.pt \
14 |     ${MCORE_MISTRAL}/iter_0000001/mp_rank_03/model_optim_rng.pt \
15 |     ${MCORE_CLIP}/iter_0000001/mp_rank_03/model_optim_rng.pt \
16 |     --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
17 |     --output \
18 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_00/model_optim_rng.pt \
19 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_01/model_optim_rng.pt \
20 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_02/model_optim_rng.pt \
21 |     ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_03/model_optim_rng.pt -------------------------------------------------------------------------------- /examples/multimodal/convert_llava_pretrain_to_wds.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import webdataset as wds
4 | 
5 | from tqdm import tqdm
6 | 
7 | llava_pretrain_dir = ''  # set to the root of the LLaVA-Pretrain dataset
8 | 
9 | # Paths to the dataset files
10 | json_file = os.path.join(llava_pretrain_dir, 'blip_laion_cc_sbu_558k.json')
11 | output = os.path.join(llava_pretrain_dir, 'wds')
12 | 
13 | if not os.path.exists(output):
14 |     os.mkdir(output)
15 | 
16 | # Load data
17 | with open(json_file, 'r') as f:
18 |     data = json.load(f)
19 | 
20 | 
with wds.ShardWriter(os.path.join(output, 'pretrain-%d.tar'), maxcount=10000) as shard_writer: 21 | for entry in tqdm(data): 22 | with open(os.path.join(llava_pretrain_dir, entry['image']), "rb") as img_file: 23 | image_data = img_file.read() 24 | sample = { 25 | "__key__": entry['id'], 26 | "jpg": image_data, 27 | "json": json.dumps(entry['conversations']).encode("utf-8"), 28 | } 29 | shard_writer.write(sample) 30 | 31 | print(f"Dataset successfully converted to wds") 32 | -------------------------------------------------------------------------------- /examples/multimodal/evaluate_vqav2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import json 4 | 5 | from open_flamingo.eval.vqa_metric import compute_vqa_accuracy 6 | 7 | 8 | def merge_input_files(input_path): 9 | """Merge input files to a format compatible with the evaluator.""" 10 | output_file_path = input_path + "-VQAv2-merged.json" 11 | 12 | pattern = input_path + "-VQAv2-[0-9].*jsonl" 13 | input_file_paths = glob.glob(pattern) 14 | 15 | results = [] 16 | 17 | for input_file_path in input_file_paths: 18 | with open(input_file_path, "r") as input_file: 19 | for line in input_file: 20 | res = json.loads(line) 21 | res["question_id"] = res["sample_id"] 22 | 23 | results.append(res) 24 | 25 | with open(output_file_path, "w") as output_file: 26 | json.dump(results, output_file) 27 | 28 | return output_file_path 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--input-path', type=str, help="Path to input file(s)") 34 | parser.add_argument('--groundtruth-path', type=str, help="Path to groundtruth file") 35 | parser.add_argument('--question-path', type=str, help="Path to questions file") 36 | args = parser.parse_args() 37 | 38 | result_file = merge_input_files(args.input_path) 39 | 40 | accuracy = compute_vqa_accuracy(result_file, args.question_path, args.groundtruth_path) 41 | print(accuracy) 42 | -------------------------------------------------------------------------------- /examples/multimodal/manual_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "Captioning": { 3 | "raw": [ 4 | "Can you briefly explain what you see in the image?", 5 | "Describe what's happening in this image in one short sentence.", 6 | "Write a short caption that accurately represents the content of this image.", 7 | "Please generate a descriptive caption for the image provided.", 8 | "How would you summarize the scene depicted in the picture in short?" 9 | ] 10 | }, 11 | "OCR": { 12 | "raw": [ 13 | "Can you read the text from image and output here?", 14 | "Extract and document the text from the provided image.", 15 | "Converting the text embedded in this image into a readable document.", 16 | "Transcribe all the text you find.", 17 | "Can you extract all visible text from the image here?" 
18 | ] 19 | }, 20 | "VQA": { 21 | "raw": [ 22 | "Given the image, answer the following question with few words.", 23 | "Answer the following question: ", 24 | "What is the answer to this question?", 25 | "Write the answer: ", 26 | "Please answer this question: " 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/multimodal/pretrain_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/multimodal/sft_dataset.yaml: -------------------------------------------------------------------------------- 1 | __module__: megatron.energon 2 | __class__: Metadataset 3 | splits: 4 | train: 5 | datasets: 6 | - weight: 1. 7 | path: 8 | subflavors: 9 | augmentation: false 10 | val: 11 | datasets: 12 | - weight: 1. 13 | path: 14 | subflavors: 15 | augmentation: false 16 | -------------------------------------------------------------------------------- /examples/t5/t5_mcore_train_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/examples/t5/t5_mcore_train_curve.png -------------------------------------------------------------------------------- /images/expt-pp32-flops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/expt-pp32-flops.png -------------------------------------------------------------------------------- /images/expt-pp32-mem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/expt-pp32-mem.png -------------------------------------------------------------------------------- /images/model_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/model_table.png -------------------------------------------------------------------------------- /images/schedule-interlaced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-interlaced.png -------------------------------------------------------------------------------- /images/schedule-vocab-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-vocab-1.png -------------------------------------------------------------------------------- /images/schedule-vocab-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/schedule-vocab-2.png 
-------------------------------------------------------------------------------- /images/st-passes-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/st-passes-1.png -------------------------------------------------------------------------------- /images/st-passes-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/st-passes-2.png -------------------------------------------------------------------------------- /images/strong_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/strong_scaling.png -------------------------------------------------------------------------------- /images/weak_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/images/weak_scaling.png -------------------------------------------------------------------------------- /input_store.py: --------------------------------------------------------------------------------
1 | from megatron.core import mpu
2 | from megatron.training import get_args
3 | 
4 | class InputStore:
5 |     """
6 |     For storing and retrieving batch inputs that are only partially consumed.
7 |     """
8 | 
9 |     cache = []
10 | 
11 |     @classmethod
12 |     def save_batch(cls, microbatch_id, data):
13 |         # Grow the cache on demand so microbatch IDs may arrive out of order.
14 |         while len(cls.cache) <= microbatch_id:
15 |             cls.cache.append(None)
16 |         cls.cache[microbatch_id] = data
17 | 
18 |     @classmethod
19 |     def get_batch(cls, microbatch_id):
20 |         contents = cls.cache[microbatch_id]
21 |         # Release the cached batch once the last virtual vocab-parallel chunk
22 |         # that needs it has read it, so the cache stays bounded.
23 |         if mpu.get_virtual_vocab_parallel_chunk() == 3:
24 |             cls.cache[microbatch_id] = None
25 |         elif (
26 |             ((not mpu.is_pipeline_last_stage()) or (get_args().use_interlaced_schedule))
27 |             and (mpu.get_virtual_vocab_parallel_chunk() == 1)
28 |         ):
29 |             cls.cache[microbatch_id] = None
30 |         return contents
31 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: --------------------------------------------------------------------------------
1 | import megatron.core.tensor_parallel
2 | import megatron.core.utils
3 | from megatron.core import parallel_state
4 | from megatron.core.distributed import DistributedDataParallel
5 | from megatron.core.inference_params import InferenceParams
6 | from megatron.core.model_parallel_config import ModelParallelConfig
7 | from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator
8 | from megatron.core.package_info import (
9 |     __contact_emails__,
10 |     __contact_names__,
11 |     __description__,
12 |     __download_url__,
13 |     __homepage__,
14 |     __keywords__,
15 |     __license__,
16 |     __package_name__,
17 |     __repository_url__,
18 |     __shortversion__,
19 |     __version__,
20 | )
21 | from megatron.core.timers import Timers
22 | 
23 | # Alias parallel_state as mpu, its legacy name
24 | mpu = parallel_state
25 | 
26 | __all__ = [
27 |     "parallel_state",
28 |     "tensor_parallel",
29 |     "utils",
30 |     "DistributedDataParallel",
31 |     "InferenceParams",
32 |     "init_num_microbatches_calculator",
33 |     "ModelParallelConfig",
34 |     "Timers",
35 | ]
36 | 
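The `InputStore` above backs the vocabulary-parallel pipeline schedules: each microbatch's input is saved once, read back later by the virtual vocab-parallel chunks, and the slot is freed by the last chunk that needs it. A standalone toy sketch of that caching pattern (the real release condition depends on the `mpu` chunk state and schedule flags, simplified here to an explicit `is_last_consumer` flag):

```python
class ToyInputStore:
    """Toy version of InputStore: cache microbatch inputs, free after last use."""

    cache = []

    @classmethod
    def save_batch(cls, microbatch_id, data):
        # Grow the list on demand so IDs may be saved out of order.
        while len(cls.cache) <= microbatch_id:
            cls.cache.append(None)
        cls.cache[microbatch_id] = data

    @classmethod
    def get_batch(cls, microbatch_id, is_last_consumer):
        contents = cls.cache[microbatch_id]
        if is_last_consumer:
            # Free the slot so peak memory stays proportional to in-flight batches.
            cls.cache[microbatch_id] = None
        return contents


ToyInputStore.save_batch(0, {"tokens": [1, 2, 3]})
first = ToyInputStore.get_batch(0, is_last_consumer=False)  # still cached
last = ToyInputStore.get_batch(0, is_last_consumer=True)    # frees the slot
assert first == last and ToyInputStore.cache[0] is None
```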
-------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .config import RetroGPTChunkDatasets 4 | from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig 5 | from .query.retro_dataset import get_retro_datasets 6 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - Embedder: Base class for all Bert embedders. 7 | - RetroBertEmbedders: Container class for in-memory and on-disk embedders. 8 | - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. 9 | - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. 10 | - RetroTokenizers: Container class for GPT and Bert tokenizers. 11 | """ 12 | 13 | from .bert_embedders import Embedder, RetroBertEmbedders 14 | from .config import RetroPreprocessingConfig 15 | from .gpt_chunk_datasets import RetroGPTChunkDatasets 16 | from .tokenizers import RetroTokenizers 17 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/bert_embedders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for holding both in-memory and on-disk Bert embedders.""" 4 | 5 | import abc 6 | from dataclasses import dataclass 7 | from typing import Any 8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | class Embedder(abc.ABC): 14 | """Base class for all Bert embedders. 15 | 16 | All embedders should be able to embed either an entire text dataset (to a 2D 17 | numpy array), or a single text string (to a 1D numpy array). 18 | """ 19 | 20 | @abc.abstractmethod 21 | def embed_text_dataset(self, text_dataset: torch.utils.data.Dataset) -> np.ndarray: 22 | """Embed a text dataset. 23 | 24 | Args: 25 | text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value. 26 | 27 | Returns: 28 | A 2D ndarray with shape (len(text_dataset), dimension(embedder)). 29 | """ 30 | 31 | @abc.abstractmethod 32 | def embed_text(self, text: str) -> np.ndarray: 33 | """Embed a simple string of text. 
34 | 35 | Args: 36 | text (str): A single text sample. 37 | 38 | Returns: 39 | A 1D ndarray with shape (dimensions(embedder),). 40 | """ 41 | 42 | 43 | @dataclass 44 | class RetroBertEmbedders: 45 | """Container dataclass for in-memory and on-disk Bert embedders.""" 46 | 47 | disk: Embedder 48 | mem: Embedder 49 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/gpt_chunk_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container dataclass for GPT chunk datasets (train, valid, and test).""" 4 | 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class RetroGPTChunkDatasets: 10 | """Container dataclass for GPT chunk datasets.""" 11 | 12 | # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. 13 | train: dict = None 14 | valid: dict = None 15 | test: dict = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/config/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Container class for GPT and Bert tokenizers.""" 4 | 5 | from dataclasses import dataclass 6 | 7 | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer 8 | 9 | 10 | @dataclass 11 | class RetroTokenizers: 12 | """Container class for GPT and Bert tokenizers.""" 13 | 14 | gpt: MegatronTokenizer = None 15 | bert: MegatronTokenizer = None 16 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - build_db: Build a chunk database from a list of indexed datasets. 7 | """ 8 | 9 | from .build import build_db 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Required external libraries for Retro preprocessing.""" 4 | 5 | import importlib 6 | 7 | required_libs = [ 8 | "faiss", 9 | "h5py", 10 | "transformers", # for huggingface bert 11 | ] 12 | 13 | for lib in required_libs: 14 | try: 15 | globals()[lib] = importlib.import_module(lib) 16 | except ImportError as e: 17 | raise Exception( 18 | f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." 19 | ) 20 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - train_index: Train an index on representative vectors. 7 | - add_to_index: Add vectors to a trained index. 8 | - build_index: Wrapper function that calls above two functions. 
9 | """ 10 | 11 | from .build import add_to_index, build_index, train_index 12 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """The IndexFactory constructs an index from an index type string.""" 4 | 5 | from megatron.core.datasets.retro.index.index import Index 6 | 7 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 8 | 9 | 10 | class IndexFactory: 11 | """Get index. 12 | 13 | Index type generally read from argument '--retro-index-ty'. 14 | """ 15 | 16 | @classmethod 17 | def get_index_class(cls, index_type: str) -> type: 18 | """Get an index class, given a type string. 19 | 20 | Args: 21 | index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). 22 | 23 | Returns: 24 | An `Index` sub-type corresponding to the `index_type`. 25 | """ 26 | return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex,}[index_type] 27 | 28 | @classmethod 29 | def get_index(cls, index_type: str) -> Index: 30 | """Construct an index from an index type string. 31 | 32 | Args: 33 | index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). 34 | 35 | Returns: 36 | An `Index` instance corresponding to the `index_type`. 37 | """ 38 | index_class = cls.get_index_class(index_type) 39 | index = index_class() 40 | return index 41 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | - FaissBaseIndex: Unoptimized Faiss index wrapper 6 | - FaissParallelAddIndex: Optimized index.add() for Faiss index. 7 | """ 8 | 9 | from .faiss_base import FaissBaseIndex 10 | from .faiss_par_add import FaissParallelAddIndex 11 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/core/datasets/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for querying the pretraining dataset.""" 4 | 5 | import os 6 | 7 | from megatron.core.datasets.megatron_dataset import MegatronDataset 8 | 9 | 10 | def get_query_dir(project_dir: str) -> str: 11 | """Get root directory of all saved query data. 12 | 13 | Args: 14 | project_dir (str): Retro project dir. 15 | 16 | Returns: 17 | Path to query sub-directory in Retro project. 18 | """ 19 | return os.path.join(project_dir, "query") 20 | 21 | 22 | def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: 23 | """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 24 | 25 | Args: 26 | project_dir (str): Retro project dir. 27 | key (str): Dataset split key; 'train', 'valid', or 'test'. 
28 | dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. 29 | 30 | Returns: 31 | Path to directory containing this dataset's neighbors within Retro project. 32 | """ 33 | return os.path.join( 34 | get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}"), 35 | ) 36 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | save, 11 | ) 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | from .common import _import_trigger 6 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .distributed_data_parallel import DistributedDataParallel 4 | from .distributed_data_parallel_config import DistributedDataParallelConfig 5 | from .finalize_model_grads import finalize_model_grads 6 | from .param_and_grad_buffer import ParamAndGradBuffer, shard_buffer 7 | -------------------------------------------------------------------------------- /megatron/core/distributed/distributed_data_parallel_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class DistributedDataParallelConfig: 9 | """Configuration for DistributedDataParallel.""" 10 | 11 | grad_reduce_in_fp32: bool = False 12 | """If true, reduce grads in fp32.""" 13 | 14 | overlap_grad_reduce: bool = False 15 | """If true, overlap grad all-reduce / reduce-scatter with backward compute.""" 16 | 17 | use_distributed_optimizer: bool = False 18 | """If true, issue reduce-scatter collectives to aggregate gradients and clean up 19 | originally allocated model parameters, otherwise issue all-reduce collectives. 20 | """ 21 | 22 | check_for_nan_in_grad: bool = False 23 | """ If true, check for NaNs in gradients _before_ communication collective.""" 24 | 25 | bucket_size: Optional[int] = None 26 | """Maximum number of parameters in each bucket. 
If unspecified, MCore uses a default 27 | value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger 28 | buckets to ensure collectives do not become latency-bound).""" 29 | 30 | average_in_collective: bool = False 31 | """If true, compute average in collective directly, as opposed to dividing by the 32 | dp_size first and then computing sum in the collective.""" 33 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | 16 | @jit_fuser 17 | def bias_gelu(bias, y): 18 | x = bias + y 19 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 20 | 21 | 22 | # gradient of tanh approximation of gelu 23 | # gradient of actual gelu is: 24 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 25 | @jit_fuser 26 | def bias_gelu_back(g, bias, y): 27 | x = bias + y 28 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 29 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 30 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 31 | 1 + tanh_out 32 | ) 33 | return ff * g 34 | 35 | 36 | class GeLUFunction(torch.autograd.Function): 37 | @staticmethod 38 | # bias is an optional argument 39 | def forward(ctx, input, bias): 40 | ctx.save_for_backward(input, bias) 41 | return bias_gelu(bias, input) 42 | 43 | @staticmethod 44 | def backward(ctx, grad_output): 45 | input, bias = ctx.saved_tensors 46 | tmp = bias_gelu_back(grad_output, bias, input) 47 | return tmp, tmp 48 | 49 | 50 | bias_gelu_impl = GeLUFunction.apply 51 | -------------------------------------------------------------------------------- /megatron/core/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/ammo_support/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/ammo_support/gpt/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | -------------------------------------------------------------------------------- /megatron/core/inference/common_inference_params.py: --------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | 
3 | 
4 | @dataclass
5 | class CommonInferenceParams:
6 |     """Inference parameters sent along with the prompts
7 | 
8 |     For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910
9 |     """
10 | 
11 |     temperature: float = 1.0
12 |     top_k: int = 0
13 |     top_p: float = 0.0
14 |     return_log_probs: bool = False
15 |     num_tokens_to_generate: int = 30
16 | 
17 |     def add_attributes(self, attribute_value_pair: dict):
18 |         """Utility to add more attributes to inference params
19 | 
20 |         Use this method to pass in a custom dictionary to add more inference parameter attributes to the instance you created. Use as follows:
21 |         c = CommonInferenceParams()
22 |         c.add_attributes({'min_length':4, 'eod_id':153})
23 | 
24 |         Args:
25 |             attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values.
26 |         """
27 |         for key, value in attribute_value_pair.items():
28 |             setattr(self, key, value)
29 | -------------------------------------------------------------------------------- /megatron/core/inference/communication_utils.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | from megatron.core import parallel_state
4 | 
5 | 
6 | def _is_cuda(tensor):
7 |     """Check that a tensor is not None and is on a CUDA device."""
8 |     assert tensor is not None
9 |     assert tensor.is_cuda
10 | 
11 | 
12 | def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
13 |     """Broadcast a tensor from the last pipeline stage to all ranks."""
14 | 
15 |     if parallel_state.is_pipeline_last_stage():
16 |         _is_cuda(tensor)
17 |         assert tensor.is_contiguous()
18 |     else:
19 |         tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device())
20 |     # Get the group and corresponding source rank.
21 |     src = parallel_state.get_pipeline_model_parallel_last_rank()
22 |     group = parallel_state.get_pipeline_model_parallel_group()
23 |     torch.distributed.broadcast(tensor, src, group)
24 |     return tensor
25 | 
26 | 
27 | def recv_from_prev_pipeline_rank_(recv_buffer=None):
28 |     """Receive from previous pipeline stage and update the
29 |     input buffer inplace."""
30 |     recv_prev_op = torch.distributed.P2POp(
31 |         torch.distributed.irecv, recv_buffer, parallel_state.get_pipeline_model_parallel_prev_rank()
32 |     )
33 |     reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
34 |     for req in reqs:
35 |         req.wait()
36 |     # To protect against race condition when using batch_isend_irecv().
37 | torch.cuda.synchronize() 38 | 39 | 40 | def send_to_next_pipeline_rank(tensor=None): 41 | """Send output to the next pipeline stage.""" 42 | send_next_op = torch.distributed.P2POp( 43 | torch.distributed.isend, tensor, parallel_state.get_pipeline_model_parallel_next_rank() 44 | ) 45 | reqs = torch.distributed.batch_isend_irecv([send_next_op]) 46 | for req in reqs: 47 | req.wait() 48 | # To protect against race condition when using batch_isend_irecv(). 49 | torch.cuda.synchronize() 50 | -------------------------------------------------------------------------------- /megatron/core/inference/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/engines/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/engines/abstract_engine.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | 5 | class AbstractEngine(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def generate(self) -> dict: 9 | """The abstract backend's generate function. 10 | 11 | To define a new backend, implement this and return the outputs as a dictionary. 12 | 13 | Returns: 14 | dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`. 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /megatron/core/inference/inference_request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from typing import List 4 | 5 | import torch 6 | 7 | from megatron.core.inference.common_inference_params import CommonInferenceParams 8 | 9 | 10 | # class syntax 11 | class Status(Enum): 12 | WAITING_IN_QUEUE = 1 13 | ACTIVE_AND_GENERATING_TOKENS = 2 14 | ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 15 | COMPLETED = 4 16 | 17 | 18 | @dataclass 19 | class InferenceRequest: 20 | request_id: str 21 | prompt: str 22 | inference_parameters: CommonInferenceParams 23 | prompt_tokens: List[int] 24 | arrival_time: float 25 | status: Status 26 | generated_text: str = None 27 | generated_tokens: torch.Tensor = None 28 | generated_log_probs: torch.Tensor = None 29 | generated_length: int = 0 30 | -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/model_inference_wrappers/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/gpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/model_inference_wrappers/gpt/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses 
import dataclass
2 | 
3 | import torch
4 | 
5 | 
6 | @dataclass
7 | class InferenceWrapperConfig:
8 |     """Config for the model inference wrapper
9 | 
10 |     NOTE: All the arguments here are obtained from the arguments.py file
11 |     """
12 | 
13 |     hidden_size: int
14 |     """Receives between pipeline stages during PP use tensors of shape [seq_len, batch_size, hidden_size]"""
15 | 
16 |     params_dtype: torch.dtype
17 |     """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used"""
18 | 
19 |     inference_batch_times_seqlen_threshold: int
20 |     """If batch size times sequence length is smaller than this threshold then we will not use pipelining, otherwise we will."""
21 | 
22 |     padded_vocab_size: int
23 |     """The final padded vocab size (padded to make it divisible by the --make-vocab-size-divisible-by value)"""
24 | 
25 |     fp32_residual_connection: bool = False
26 |     """Move residual connections to fp32. Obtained from arguments.py"""
27 | 
28 |     def add_attributes(self, attribute_value_pair: dict):
29 |         """Utility to add more attributes to the inference wrapper config
30 | 
31 |         Use this method to pass in a custom dictionary to add more config attributes to the instance you created. Use as follows:
32 |         c = InferenceWrapperConfig()
33 |         c.add_attributes({'precision':'fp32'})
34 | 
35 |         Args:
36 |             attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values.
37 |         """
38 |         for key, value in attribute_value_pair.items():
39 |             setattr(self, key, value)
40 | -------------------------------------------------------------------------------- /megatron/core/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/inference/text_generation_controllers/__init__.py -------------------------------------------------------------------------------- /megatron/core/inference/utils.py: --------------------------------------------------------------------------------
1 | class Counter:
2 |     """A simple counter class
3 | 
4 |     This class is responsible for assigning request ids to incoming requests
5 |     """
6 | 
7 |     def __init__(self, start: int = 0) -> None:
8 |         self.counter = start
9 | 
10 |     def __next__(self) -> int:
11 |         i = self.counter
12 |         self.counter += 1
13 |         return i
14 | 
15 |     def reset(self) -> None:
16 |         self.counter = 0
17 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: --------------------------------------------------------------------------------
1 | class InferenceParams:
2 |     """Inference parameters that are passed to the main model in order
3 |     to efficiently calculate and store the context during inference."""
4 | 
5 |     def __init__(self, max_batch_size, max_sequence_length):
6 |         self.max_sequence_length = max_sequence_length
7 |         self.max_batch_size = max_batch_size
8 |         self.sequence_len_offset = 0
9 |         self.batch_size_offset = 0
10 |         self.key_value_memory_dict = {}
11 | 
12 |     def swap_key_value_dict(self, batch_idx):
13 |         """Swap between batches."""
14 |         if len(self.key_value_memory_dict) == 0:
15 |             raise ValueError("should not swap when dict is empty")
16 | 
17 |         for layer_number in self.key_value_memory_dict.keys():
18 |             inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
19 |             assert (
20 |                 len(batch_idx) == inference_key_memory.shape[1]
21 |             )  # make sure batch size is the same
22 | 
new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | 29 | def __str__(self): 30 | return f"InferenceParams(max_seq_len = {self.max_sequence_length}, max_batch_size = {self.max_batch_size}, sequence_len_offset = {self.sequence_len_offset}, batch_size_offset = {self.batch_size_offset}, key_value_memory_dict = {self.key_value_memory_dict.keys()})" 31 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 6 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 7 | 8 | jit_fuser = torch.jit.script 9 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 10 | if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2): 11 | jit_fuser = torch.compile 12 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | from .t5_model import T5Model 2 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_lm_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | from megatron.core.transformer.utils import get_linear_layer 7 | 8 | try: 9 | import apex 10 | 11 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 12 | 13 | HAVE_APEX = True 14 | LNImpl = FusedLayerNorm 15 | except ImportError: 16 | import warnings 17 | 18 | from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm 19 | 20 | warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') 21 | LNImpl = WrappedTorchLayerNorm 22 | 23 | 24 | class BertLMHead(MegatronModule): 25 | """Masked LM head for Bert. 26 | 27 | Args: 28 | hidden_size: hidden size 29 | config (TransformerConfig): TransformerConfig object 30 | """ 31 | 32 | def __init__( 33 | self, 34 | hidden_size: int, 35 | config: TransformerConfig, 36 | ): 37 | super().__init__(config=config) 38 | 39 | # TODO: Should switch this to TE ? 
40 | self.dense = get_linear_layer( 41 | hidden_size, hidden_size, config.init_method, config.perform_initialization 42 | ) 43 | 44 | setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) 45 | setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) 46 | 47 | self.layer_norm = LNImpl( 48 | config=config, 49 | hidden_size=hidden_size, 50 | eps=config.layernorm_epsilon, 51 | ) 52 | 53 | self.gelu = torch.nn.functional.gelu 54 | 55 | def forward(self, hidden_states: Tensor) -> Tensor: 56 | hidden_states = self.dense(hidden_states) 57 | hidden_states = self.gelu(hidden_states) 58 | hidden_states = self.layer_norm(hidden_states) 59 | return hidden_states 60 | -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/embeddings/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 
11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | from .mamba_model import MambaModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/multimodal/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | Exports: 5 | 6 | - RetroConfig: configuration dataclass for RetroModel. 7 | - RetroModel: The Retro model. 8 | - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 9 | """ 10 | 11 | from .config import RetroConfig 12 | from .decoder_spec import get_retro_decoder_block_spec 13 | from .model import RetroModel 14 | -------------------------------------------------------------------------------- /megatron/core/models/retro/base_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Base class for decoder and encoder attention modules.""" 4 | 5 | from megatron.core.models.retro.config import RetroConfig 6 | from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules 7 | from megatron.core.transformer.enums import AttnMaskType 8 | from megatron.core.transformer.module import MegatronModule 9 | 10 | 11 | class BaseRetroCrossAttention(MegatronModule): 12 | 13 | """Base class for Retro cross attention, for both encoder & decoder layers. 14 | 15 | This class collects the retro arguments below (i.e., num neighbors, chunk 16 | length, and retrieve length) for use in Retro's custom cross attention 17 | operators. 18 | 19 | Args: 20 | config (RetroConfig): Retro config. 21 | submodules (CrossAttentionSubmodules): Cross attention submodules. 22 | layer_number (int): Layer number within transformer block. 23 | attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). 
24 | """ 25 | 26 | def __init__( 27 | self, 28 | config: RetroConfig, 29 | submodules: CrossAttentionSubmodules, 30 | layer_number: int = 1, 31 | attn_mask_type: AttnMaskType = AttnMaskType.padding, 32 | ): 33 | super().__init__(config=config) 34 | 35 | self.attn = CrossAttention( 36 | config=config, 37 | submodules=submodules, 38 | layer_number=layer_number, 39 | attn_mask_type=attn_mask_type, 40 | ) 41 | 42 | self.retro_num_neighbors = config.retro_num_neighbors 43 | self.retro_chunk_length = config.retro_chunk_length 44 | self.retro_retrieved_length = config.retro_retrieved_length 45 | -------------------------------------------------------------------------------- /megatron/core/models/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | import torch 6 | 7 | 8 | def get_config_path(project_dir: str) -> str: 9 | """Config copy stored within retro project dir.""" 10 | return os.path.join(project_dir, "config.json") 11 | 12 | 13 | def get_gpt_data_dir(project_dir: str) -> str: 14 | """Get project-relative directory of GPT bin/idx datasets.""" 15 | return os.path.join(project_dir, "data") 16 | 17 | 18 | # ** Note ** : Retro's compatibility between cross attention and Flash/Fused 19 | # Attention is currently a work in progress. We default to returning None for 20 | # now. 21 | # def get_all_true_mask(size, device): 22 | # return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) 23 | def get_all_true_mask(size, device): 24 | return None 25 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | MAJOR = 0 5 | MINOR = 9 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from torch import Tensor 4 | 5 | 6 | @dataclass 7 | class PackedSeqParams: 8 | # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, 9 | qkv_format: str = None 10 | cu_seqlens_q: Tensor = None 11 | cu_seqlens_kv: Tensor = None 12 | max_seqlen_q: Tensor = None 13 | max_seqlen_kv: Tensor = None 14 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/ssm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/ssm/__init__.py -------------------------------------------------------------------------------- /megatron/core/ssm/triton_cache_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import socket 5 | from pathlib import Path 6 | 7 | import torch 8 | 9 | try: 10 | from triton.runtime.cache import FileCacheManager 11 | except ImportError: 12 | raise ImportError("triton is required by the Mamba model but cannot be imported") 13 | 14 | 15 | def get_rank(): 16 | return torch.distributed.get_rank() 17 | 18 | 19 | def default_cache_dir(): 20 | return os.path.join(Path.home(), ".triton", "cache") 21 | 22 | 23 | class ParallelFileCacheManager(FileCacheManager): 24 | 25 | # See https://github.com/triton-lang/triton/blob/main/python/triton/runtime/cache.py 26 | 27 | # When running Triton with multiple ranks, they each create their own cache manager. 
Their input 28 | # keys to that class are mostly (but not entirely) the same across ranks, which leads many ranks 29 | # to write to the same 'key' directories in the cache dir at the same time during compilation, 30 | # leading to conflicts. This class works around that by making each cache dir rank-specific, 31 | # inserting a "rank_<hostname>_<pid>" component into the cache directory path. 32 | 33 | def __init__(self, key): 34 | self.key = key 35 | self.lock_path = None 36 | # create cache directory if it doesn't exist 37 | self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) 38 | self.cache_dir = os.path.join( 39 | self.cache_dir, "rank_{}_{}".format(socket.gethostname(), os.getpid()) 40 | ) 41 | if self.cache_dir: 42 | self.cache_dir = os.path.join(self.cache_dir, self.key) 43 | self.lock_path = os.path.join(self.cache_dir, "lock") 44 | os.makedirs(self.cache_dir, exist_ok=True) 45 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/vocab_input_store.py: -------------------------------------------------------------------------------- 1 | 2 | class VocabInputStore: 3 | """ 4 | For storing and retrieving intermediate results of the VocabParallelInput layer. 5 | 6 | Both caches are class-level FIFO queues, so entries are retrieved in the order they were stored. 7 | """ 8 | 9 | forward_cache = [] 10 | backward_cache = [] 11 | 12 | @classmethod 13 | def forward_store(cls, output_tensor, handle): 14 | """Store a forward output tensor together with its (possibly asynchronous) communication handle.""" 15 | cls.forward_cache.append((output_tensor, handle)) 16 | 17 | @classmethod 18 | def forward_get(cls, remove=True): 19 | """Return the oldest cached tensor, waiting on its handle first; pop it unless remove=False.""" 20 | output_tensor, handle = cls.forward_cache[0] 21 | if handle is not None: 22 | handle.wait() 23 | if remove: 24 | cls.forward_cache.pop(0) 25 | else: 26 | cls.forward_cache[0] = (output_tensor, None) 27 | return output_tensor 28 | 29 | @classmethod 30 | def backward_store(cls, grad_output): 31 | """Store a gradient from the backward pass for later retrieval.""" 32 | cls.backward_cache.append(grad_output) 33 | 34 | @classmethod 35 | def backward_get(cls): 36 | """Pop and return the oldest cached gradient.""" 37 | contents = cls.backward_cache[0] 38 | cls.backward_cache.pop(0) 39 | return contents 40 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/vocab_output_store.py: -------------------------------------------------------------------------------- 1 | 2 | class VocabOutputStore: 3 | """ 4 | For storing and retrieving intermediate results of the VocabParallelOutput layer.
5 | """ 6 | 7 | # Set by the pipeline schedule to select which microbatch slot to read or write. 8 | microbatch_id = 0 9 | forward_cache = [] 10 | backward_cache = [] 11 | 12 | @classmethod 13 | def forward_store(cls, sum_exp_logits, logits_max, predicted_logits, target_mask, 14 | softmax_grad_input, ground_truth_grad_input): 15 | """Store the partial softmax statistics and gradient inputs for the current microbatch.""" 16 | while len(cls.forward_cache) <= cls.microbatch_id: 17 | cls.forward_cache.append(None) 18 | cls.forward_cache[cls.microbatch_id] = ( 19 | sum_exp_logits, logits_max, predicted_logits, target_mask, softmax_grad_input, ground_truth_grad_input 20 | ) 21 | 22 | @classmethod 23 | def forward_get(cls): 24 | """Return and clear the cached forward entry for the current microbatch.""" 25 | contents = cls.forward_cache[cls.microbatch_id] 26 | cls.forward_cache[cls.microbatch_id] = None 27 | return contents 28 | 29 | @classmethod 30 | def backward_store(cls, sum_exp_logits, logits_max, grad_output): 31 | """Store the softmax statistics and incoming gradient for the current microbatch.""" 32 | while len(cls.backward_cache) <= cls.microbatch_id: 33 | cls.backward_cache.append(None) 34 | cls.backward_cache[cls.microbatch_id] = ( 35 | sum_exp_logits, logits_max, grad_output 36 | ) 37 | 38 | @classmethod 39 | def backward_get(cls): 40 | """Return and clear the cached backward entry for the current microbatch.""" 41 | contents = cls.backward_cache[cls.microbatch_id] 42 | cls.backward_cache[cls.microbatch_id] = None 43 | return contents 44 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | no_mask = 3 # only used for TE 27 | padding_causal = 4 # only used for thd attention 28 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
20 | Such a func is handy for ops like `bias_dropout_fusion`, which themselves 21 | return a function at runtime based on the passed arguments. 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | return grouped_gemm is not None 11 | 12 | 13 | def assert_grouped_gemm_is_available(): 14 | assert grouped_gemm_is_available(), ( 15 | "Grouped GEMM is not available. Please run " 16 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`." 17 | ) 18 | 19 | 20 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 21 | -------------------------------------------------------------------------------- /megatron/core/transformer/torch_layer_norm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from megatron.core.transformer import TransformerConfig 6 | 7 | 8 | class WrappedTorchLayerNorm(torch.nn.LayerNorm): 9 | 10 | def __init__( 11 | self, 12 | config: TransformerConfig, 13 | hidden_size: int, 14 | eps: float = 1e-5, 15 | persist_layer_norm: bool = False, ## TODO: unused arguments. See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/issues/223 16 | zero_centered_gamma: bool = False, 17 | normalization: str = "LayerNorm", # included to match TE interface 18 | ): 19 | self.config = config 20 | assert ( 21 | not self.config.layernorm_zero_centered_gamma 22 | ), "zero_centered_gamma not supported by torch LayerNorm" 23 | 24 | assert ( 25 | self.config.normalization == "LayerNorm" 26 | ), f'({self.config.normalization}) is not supported by torch LayerNorm' 27 | 28 | assert ( 29 | not self.config.persist_layer_norm 30 | ), "persist_layer_norm not supported by torch LayerNorm" 31 | 32 | assert ( 33 | not self.config.sequence_parallel 34 | ), "sequence parallel not supported by torch LayerNorm" 35 | 36 | assert ( 37 | not self.config.memory_efficient_layer_norm 38 | ), "memory_efficient_layer_norm not supported by torch LayerNorm" 39 | 40 | super().__init__( 41 | normalized_shape=hidden_size, ## applied to last len(normalized_shape.size) dimensions 42 | eps=eps, 43 | ) 44 | -------------------------------------------------------------------------------- /megatron/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | 3 | 4 | def add_modelopt_args(parser): 5 | """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.""" 6 | group = parser.add_argument_group(title="modelopt-generic") 7 | 8 | group.add_argument( 9 | "--export-legacy-megatron", 10 | action="store_true", 11 | help="Export a legacy megatron-lm checkpoint.", 12 | ) 13 | group.add_argument( 14 | "--export-te-mcore-model", 15 | action="store_true", 16 | help="Export a megatron-core transformer-engine checkpoint.", 17 | ) 18 | group.add_argument( 19 | "--export-quant-cfg", 20 | type=str, 21 | default=None, 22 | choices=["int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "None"], 23 | help="Specify a quantization config from the supported choices.", 24 | ) 25 | 26 | return parser 27 | -------------------------------------------------------------------------------- /megatron/inference/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | -------------------------------------------------------------------------------- /megatron/inference/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/legacy/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/data/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need these class definitions to deserialize old checkpoints.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes.
*/ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/legacy/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/legacy/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/legacy/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/legacy/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from megatron.core.jit import jit_fuser 5 | 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | @jit_fuser 16 | def bias_gelu(bias, y): 17 | x = bias + y 18 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @jit_fuser 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 29 | return ff * g 30 | 31 | class GeLUFunction(torch.autograd.Function): 32 | @staticmethod 33 | # bias is an optional argument 34 | def forward(ctx, input, bias): 35 | ctx.save_for_backward(input, bias) 36 | return bias_gelu(bias, input) 37 | 38 | @staticmethod 39 | def backward(ctx, grad_output): 40 | input, bias = ctx.saved_tensors 41 | tmp = bias_gelu_back(grad_output, bias, input) 42 | return tmp, tmp 43 | 44 | bias_gelu_impl = GeLUFunction.apply 45 | -------------------------------------------------------------------------------- /megatron/legacy/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False): 12 | """RMS Normalization module 13 | 14 | Args: 15 | dim (int): The width of input, i.e. hidden size 16 | eps (float): epsilon to use for the norm, defaults to 1e-6 17 | sequence_parallel (bool): Set to true if sequence parallelism is being used; 18 | this marks the weights as needing to be all-reduced. 19 | """ 20 | super().__init__() 21 | self.eps = eps 22 | self.weight = nn.Parameter(torch.ones(dim)) 23 | 24 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 25 | 26 | def _norm(self, x): 27 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 28 | 29 | def forward(self, x): 30 | output = self._norm(x.float()).type_as(x) 31 | return output * self.weight 32 | -------------------------------------------------------------------------------- /megatron/legacy/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would be more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/legacy/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/megatron/legacy/mpu/tests/__init__.py
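Editor's note on `fused_bias_gelu.py` above: `bias_gelu_back` hand-codes the derivative of the tanh-approximated GeLU instead of relying on autograd, and `GeLUFunction.backward` returns the same tensor twice because the gradient with respect to `bias` equals the gradient with respect to `input` for a function of `bias + input`. Below is a minimal editorial sanity check, not part of the repo; it re-declares the two functions without the `@jit_fuser` decorator so it runs standalone:

```python
import torch

def bias_gelu(bias, y):
    # Tanh approximation of GeLU, as in fused_bias_gelu.py.
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

def bias_gelu_back(g, bias, y):
    # Hand-written gradient of the approximation above.
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g

bias = torch.randn(8, dtype=torch.float64, requires_grad=True)
y = torch.randn(8, dtype=torch.float64, requires_grad=True)

out = bias_gelu(bias, y)
out.backward(torch.ones_like(out))

# The analytic gradient should agree with autograd for both inputs,
# since d/d(bias) == d/d(y) for f(bias + y).
manual = bias_gelu_back(torch.ones_like(out), bias.detach(), y.detach())
assert torch.allclose(bias.grad, manual, atol=1e-10)
assert torch.allclose(y.grad, manual, atol=1e-10)
print("bias_gelu_back matches autograd")
```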
-------------------------------------------------------------------------------- /megatron/training/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args 6 | from .global_vars import get_signal_handler 7 | from .global_vars import get_tokenizer 8 | from .global_vars import get_tensorboard_writer 9 | from .global_vars import get_wandb_writer 10 | from .global_vars import get_one_logger 11 | from .global_vars import get_adlr_autoresume 12 | from .global_vars import get_timers 13 | from .initialize import initialize_megatron 14 | from .training import pretrain, get_model, get_train_valid_test_num_samples 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/training/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | try: 6 | jit_fuser = torch.compile 7 | except: 8 | jit_fuser = torch.jit.script 9 | 10 | 11 | @jit_fuser 12 | def squared_relu(x: torch.Tensor) -> torch.Tensor: 13 | return torch.pow(F.relu(x), 2) 14 | 15 | 16 | @jit_fuser 17 | def quick_gelu(x: torch.Tensor) -> torch.Tensor: 18 | return x * torch.sigmoid(1.702 * x) 19 | -------------------------------------------------------------------------------- /megatron/training/async_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ 4 | This module provides a singleton instance of AsyncCallsQueue which manages 5 | the async checkpoint save calls. 6 | """ 7 | import logging 8 | 9 | from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue, AsyncRequest 10 | from megatron.training import get_args 11 | from megatron.training.utils import print_rank_0 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | # Singleton manager of async calls 16 | _async_calls_queue = AsyncCallsQueue() 17 | 18 | 19 | def schedule_async_save(async_request: AsyncRequest): 20 | """ Schedule the async save request. 21 | 22 | Args: 23 | async_request (AsyncRequest): the async save request. 24 | """ 25 | _async_calls_queue.schedule_async_request(async_request) 26 | 27 | 28 | def maybe_finalize_async_save(blocking: bool = False): 29 | """ Finalizes active async save calls. 30 | 31 | Args: 32 | blocking (bool, optional): if True, will wait until all active requests 33 | are done. Otherwise, finalizes only the async request that already 34 | finished. Defaults to False. 35 | """ 36 | args = get_args() 37 | if not args.async_save: 38 | return 39 | 40 | if blocking and _async_calls_queue.get_num_unfinalized_calls() > 0: 41 | print_rank_0('Unfinalized async checkpoint saves. Finalizing them synchronously now.') 42 | 43 | _async_calls_queue.maybe_finalize_async_calls(blocking) 44 | -------------------------------------------------------------------------------- /megatron/training/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries from the blacklisted modules 20 | # (e.g., PyTorch Distributed) from passing through. 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/training/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | [build-system] 4 | requires = [ 5 | "setuptools", 6 | "pybind11", 7 | ] 8 | 9 | [tool.isort] 10 | profile = "black" # black-compatible 11 | line_length = 100 # should match black parameters 12 | py_version = 38 # python 3.8 as a target version 13 | known_first_party = ["megatron"] # FIRSTPARTY section 14 | known_third_party = ["transformer_engine"] # THIRDPARTY section 15 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 16 | default_section = "THIRDPARTY" 17 | extend_skip = ["setup.py"] 18 | 19 | [tool.black] 20 | line_length = 100 21 | skip_string_normalization = true 22 | # recognized by future versions; disallows reformatting code with an incompatible version 23 | # Matches the NeMo version so people working on both codebases don't need two different versions of black installed 24 | required_version = "24" 25 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. 
We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron.training import get_args 6 | from megatron.training import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron.training import get_args, print_rank_0 6 | from megatron.legacy.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/jet_recipes/MR-gpt-nemo.yaml: -------------------------------------------------------------------------------- 1 | type: basic 2 | format_version: 1 3 | maintainers: [maanug] 4 | loggers: [stdout] 5 | launchers: 6 | type:slurm: 7 | ntasks_per_node: '{gpus}' 8 | no_container_mount_home: 'true' 9 | spec: 10 | name: "{model}_{variant}_{scope}_\ 11 | mbs{mbs}_gbs{gbs}_\ 12 | {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\ 13 | tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\ 14 | {'_'+args_meta if args_meta else ''} 15 | _{platforms}_{nodes}N{gpus}G" 16 | model: gpt3-nemo 17 | variant: 126m 18 | build: mcore-nemo 19 | scope: mr 20 | nodes: 1 21 | gpus: 8 22 | platforms: dgx_a100 23 | steps: 50 24 | extra_args: null 25 | args_meta: null 26 | precision: bf16 27 | time_limit: 1200 28 | use_mcore: True 29 | use_te: True 30 | vp_size: null 31 | script: |- 32 | cd /opt/NeMo 33 | 34 | /opt/megatron-lm/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh \ 35 | TP_SIZE={tp_size} \ 36 | PP_SIZE={pp_size} \ 37 | NUM_NODES={nodes} \ 38 | MAX_STEPS={steps} \ 39 | VP_SIZE={vp_size if vp_size is not None else '""'} \ 40 | MBS={mbs} \ 41 | GBS={gbs} \ 42 | JOB_NAME={name} \ 43 | ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} 44 | products: 45 | - {tp_size: [1], pp_size: [1], mbs: [4], gbs: [64], vp_size: [null]} 46 | - {tp_size: [2], pp_size: [4], mbs: [1], gbs: [8], vp_size: [3], extra_args: ['"model.sequence_parallel=True model.overlap_p2p_comm=True model.batch_p2p_comm=False"'], args_meta: ["seq_par_overlap_p2p"]} 47 | -------------------------------------------------------------------------------- /tests/functional_tests/jet_recipes/build-pyt.yaml: -------------------------------------------------------------------------------- 1 | type: build 2 | 
format_version: 1 3 | maintainers: [maanug] 4 | spec: 5 | name: mcore-pyt 6 | platforms: [linux/amd64] 7 | source: 8 | # The image tag will be added via `jet-tests.yaml` 9 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 10 | image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci 11 | 12 | 13 | --- 14 | type: build 15 | format_version: 1 16 | maintainers: [maanug] 17 | spec: 18 | name: mcore-nemo 19 | platforms: [linux/amd64] 20 | source: 21 | # The image tag will be added via `jet-tests.yaml` 22 | # Tags are one of {buildcache, $CI_PIPELINE_ID} 23 | image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 4 | import json 5 | import sys 6 | 7 | from tests.functional_tests.python_test_utils.common import read_tb_logs_as_list 8 | 9 | 10 | def collect_train_test_metrics(logs_dir, run_name): 11 | summaries = read_tb_logs_as_list(logs_dir) 12 | 13 | train_metrics = { 14 | metric_name: { 15 | "start_step": 0, 16 | "end_step": len(metric_values), 17 | "step_interval": 5, 18 | "values": metric_values[0 : len(metric_values) : 5], 19 | } 20 | for metric_name, metric_values in summaries.items() 21 | } 22 | print( 23 | f"\n ----------- Store the following metrics in tests/functional_tests/test_results/jet/{run_name}.json ----------" 24 | ) 25 | print(f"\n {json.dumps(train_metrics)}", flush=True) 26 | 27 | 28 | if __name__ == "__main__": 29 | args = sys.argv[1:] 30 | logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/ 31 | run_name = args[1] 32 | collect_train_test_metrics(logs_dir, run_name) 33 | -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/multitest_ci_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pytest 4 | import sys 5 | import glob 6 | from .common import read_tb_logs_as_list, TypeOfTest 7 | from .test_ci_pipeline import TestCIPipeline 8 | 9 | LOGS_DIR = os.getenv('LOGS_DIR') 10 | EXPECTED_METRICS_DIR = os.getenv('EXPECTED_METRICS_DIR') 11 | 12 | 13 | class TestBulkCIPipeline(TestCIPipeline): 14 | 15 | margin_loss, margin_time = 0.05, 0.1 16 | 17 | def _setup(self, config_name): 18 | self.config_name = config_name 19 | baseline_filename = config_name + '.json' 20 | 21 | filepath = os.path.join(EXPECTED_METRICS_DIR, baseline_filename) 22 | if os.path.exists(filepath): 23 | with open(filepath) as f: 24 | self.expected = json.load(f) 25 | else: 26 | raise FileNotFoundError(f"{baseline_filename} does not exist") 27 | 28 | def _get_actual(self, loss_type): 29 | return read_tb_logs_as_list(LOGS_DIR+'/'+self.config_name, loss_type) 30 | 31 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 32 | def test_lm_loss_deterministic(self, config_name): 33 | # Expected training loss curve at different global steps. 
34 | self._setup(config_name) 35 | self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) 36 | 37 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 38 | def test_lm_loss_approx(self, config_name): 39 | # Expected training loss curve at different global steps. 40 | self._setup(config_name) 41 | self._test_helper("lm loss", TypeOfTest.APPROX) 42 | 43 | @pytest.mark.parametrize("config_name", os.listdir(LOGS_DIR)) 44 | def test_num_zeros_deterministic(self, config_name): 45 | # Expected validation loss curve at different global steps. 46 | self._setup(config_name) 47 | self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC) 48 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4681, 10.45734, 10.4491, 10.44102, 10.41779, 10.34626, 10.11378, 10.04382, 9.86692, 9.67893]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2373.0, 2593.0, 2187.0, 2403.0, 2412.0, 2617.0, 3083.0, 3341.0, 3558.0, 3213.0]}, "iteration_timing_avg": 0.8346488235294117} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.42107, 10.42897, 10.43577, 10.40787, 10.38455, 10.32433, 10.13158, 10.04316, 9.86274, 9.65777]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2229.0, 3600.0, 3300.0, 3311.0, 3522.0, 3498.0, 4076.0, 4135.0, 4709.0, 4350.0]}, "iteration_timing_avg": 1.8964105882352944} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50096, 10.48594, 10.4936, 10.48501, 10.50417, 10.4773, 10.42153, 10.29719, 10.15831, 9.9675]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [18201.0, 19789.0, 21743.0, 18735.0, 21941.0, 19700.0, 21781.0]}, "iteration_timing_avg": 0.4730702941176471} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_345m_nightly_dgx_a100_1N8G_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.49275, 10.48836, 10.51349, 
10.49399, 10.47549, 10.41922, 10.28044, 10.14255, 9.94736]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26212.0, 19433.0, 24101.0, 23509.0, 21539.0, 17889.0, 19123.0]}, "iteration_timing_avg": 1.6886158823529411} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44389, 10.35605, 10.13777, 10.04004, 9.86833, 9.67303]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2398.0, 2539.0, 2945.0, 3162.0, 3457.0, 3125.0]}, "iteration_timing_avg": 0.8110379411764704} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | { 2 | "lm loss": { 3 | "start_step": 0, 4 | "end_step": 50, 5 | "step_interval": 5, 6 | "values": [ 7 | 10.49566, 8 | 10.48166, 9 | 10.48045, 10 | 10.45348, 11 | 10.44393, 12 | 10.35605, 13 | 10.13787, 14 | 10.04034, 15 | 9.86836, 16 | 9.6732 17 | ] 18 | }, 19 | "num-zeros": { 20 | "start_step": 0, 21 | "end_step": 50, 22 | "step_interval": 5, 23 | "values": [ 24 | 2183.0, 25 | 2469.0, 26 | 2115.0, 27 | 2126.0, 28 | 2322.0, 29 | 2411.0, 30 | 2892.0, 31 | 3234.0, 32 | 3637.0, 33 | 2992.0 34 | ] 35 | }, 36 | "mem-allocated-bytes": { 37 | "start_step": 0, 38 | "end_step": 50, 39 | "step_interval": 5, 40 | "values": [ 41 | 1718216192.0, 42 | 1718216192.0, 43 | 1718216192.0, 44 | 1718216192.0, 45 | 1718216192.0, 46 | 1718216192.0, 47 | 1718216192.0, 48 | 1718216192.0, 49 | 1718216192.0, 50 | 1718216192.0 51 | ] 52 | }, 53 | "iteration-time": { 54 | "start_step": 0, 55 | "end_step": 50, 56 | "step_interval": 5, 57 | "values": [ 58 | 13.22827, 59 | 0.88854, 60 | 0.92588, 61 | 0.89793, 62 | 0.95437, 63 | 0.88007, 64 | 0.88504, 65 | 0.88703, 66 | 0.89866, 67 | 0.88756 68 | ] 69 | } 70 | } -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_resume_tp1_pp2dgx_a100_1N8G_.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49533, 10.46662, 10.42395, 10.30693, 10.15979, 9.96957, 9.87618, 9.75265, 9.63628, 9.54659, 9.49973, 9.35968, 9.33181, 9.2626, 9.26439, 9.21492]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [18772.0, 19035.0, 22350.0, 18671.0, 20738.0, 23121.0, 22655.0, 27141.0, 24304.0, 25619.0, 17322.0, 32489.0, 28409.0, 21067.0, 37615.0, 30599.0, 26145.0]}, "iteration_timing_avg": 0.3927519402985073} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49569, 10.4596, 10.32846, 10.17265, 9.96951]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 
27627.0, 22759.0, 22567.0, 20671.0, 23229.0]}, "iteration_timing_avg": 0.7692817647058824} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/bert_mr_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.0958791176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312, 9.8347, 9.61264, 9.67965, 9.68133, 9.60021, 9.06887, 9.46573, 9.06116, 9.32103, 9.51104]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0, 2686.0, 2671.0, 3014.0, 3152.0, 2960.0, 3015.0, 3735.0, 2675.0, 2947.0, 3414.0]}, "iteration_timing_avg": 0.08244119402985074} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0]}, "iteration_timing_avg": 0.11905411764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 
1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.1541691176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153, 9.83685, 9.60745, 9.68285, 9.6869, 9.60677, 9.07989, 9.47324, 9.07018, 9.33019, 9.51809]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0, 2540.0, 2588.0, 3110.0, 3059.0, 2924.0, 2894.0, 3694.0, 2720.0, 2635.0, 3456.0]}, "iteration_timing_avg": 0.150555671641791} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404, 9.85697, 9.65534, 9.71837, 9.74563, 9.63824, 9.13952, 9.51114, 9.10678, 9.3932, 9.56085]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0, 4218.0, 4359.0, 4468.0, 5080.0, 4575.0, 4964.0, 5755.0, 4852.0, 4092.0, 5592.0]}, "iteration_timing_avg": 0.33336671641791044} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_4experts2parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81916, 10.86702, 10.85724, 10.80665, 10.71115, 10.63679, 10.16197, 10.277, 10.18384, 9.88281, 9.89125, 9.67734, 9.74917, 9.75758, 9.65591, 9.15592, 9.52069, 9.11526, 9.4051, 9.56814]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [7138.0, 8525.0, 8821.0, 8718.0, 7682.0, 8227.0, 7158.0, 8514.0, 9143.0, 9624.0, 9298.0, 10386.0, 10352.0, 12164.0, 10941.0, 12318.0, 13902.0, 11709.0, 10898.0, 12956.0]}, "iteration_timing_avg": 0.33394373134328353} -------------------------------------------------------------------------------- 
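Each golden-result file in this directory follows the same schema: for every tracked metric (`lm loss`, `num-zeros`, and in the newer files `mem-allocated-bytes` and `iteration-time`), the fields `start_step`, `end_step`, and `step_interval` describe which training steps were sampled, and `values` holds one reading per sampled step, i.e. one entry per element of `range(start_step, end_step, step_interval)`. The sketch below is a minimal, hypothetical consistency check over such a file; the actual comparison logic lives in the functional-test harness and is not shown here.

import json

def load_golden(path):
    # Load a golden-result file and sanity-check each metric's shape.
    with open(path) as f:
        golden = json.load(f)
    for metric, spec in golden.items():
        if not isinstance(spec, dict):
            # Scalars such as "iteration_timing_avg" carry no step schedule.
            continue
        steps = range(spec["start_step"], spec["end_step"], spec["step_interval"])
        assert len(spec["values"]) == len(steps), (
            f"{metric}: expected {len(steps)} values, got {len(spec['values'])}"
        )
    return golden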
/tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79589, 10.84021, 10.81376, 10.76508, 10.65703, 10.56193, 10.08837, 10.21303, 10.11641, 9.83404]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3018.0, 3528.0, 3496.0, 3388.0, 3149.0, 3337.0, 2811.0, 3403.0, 3728.0, 3648.0]}, "iteration_timing_avg": 0.33478764705882363} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_4experts2parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81916, 10.86661, 10.85683, 10.80678, 10.7112, 10.63712, 10.16253, 10.27882, 10.18795, 9.88907]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [12923.0, 15794.0, 16416.0, 15771.0, 14114.0, 15096.0, 12918.0, 15842.0, 16657.0, 17467.0]}, "iteration_timing_avg": 0.340485} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0]}, "iteration_timing_avg": 0.27329441176470587} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.27828194029850745} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86126, 10.88643, 10.87769, 10.83107, 10.71638, 10.60596, 10.13128, 10.22754, 10.15911, 9.83462, 9.85168, 9.62946, 9.69489, 9.71388, 9.61776, 9.09854, 9.48539, 9.07183, 9.3531, 9.52651]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1752.0, 2067.0, 2155.0, 2021.0, 1955.0, 1968.0, 1742.0, 2198.0, 2491.0, 2606.0, 2850.0, 2977.0, 3220.0, 3391.0, 3297.0, 3203.0, 4083.0, 3048.0, 2939.0, 3838.0]}, "iteration_timing_avg": 0.2851294029850746} 
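A useful invariant visible in these goldens: each `resume_*` variant extends the corresponding 50-step run to 100 steps, and its first ten sampled `lm loss` values coincide with the shorter baseline (compare `...mcore_tp4_pp1.json` with `...mcore_tp4_pp1_resume_torch.json` above), which is exactly what a correct checkpoint resume should reproduce. A hypothetical spot check, reusing the `load_golden` sketch from earlier:

# Paths are illustrative; the files live under tests/functional_tests/test_results/jet/.
baseline = load_golden("gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1.json")
resumed = load_golden("gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch.json")
n = len(baseline["lm loss"]["values"])
# The resumed run must retrace the baseline trajectory over the first 50 steps.
assert resumed["lm loss"]["values"][:n] == baseline["lm loss"]["values"][:n]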
-------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 
10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 
1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} -------------------------------------------------------------------------------- 
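Exact float equality against these goldens is brittle across GPU architectures, driver stacks, and library versions, so comparisons of the sampled metric values are normally done with a tolerance. The tolerances below are assumptions for illustration only; the thresholds actually used by the harness are not recorded in these files.

import math

def compare_metric(expected, actual, rel_tol=1e-4, abs_tol=0.0):
    # Compare two equally long lists of sampled metric values.
    assert len(expected) == len(actual), "step sampling mismatch"
    for i, (e, a) in enumerate(zip(expected, actual)):
        assert math.isclose(e, a, rel_tol=rel_tol, abs_tol=abs_tol), (
            f"sample {i}: expected {e}, got {a}"
        )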
/tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 
1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86506, 10.87184, 10.80703, 10.71158, 10.63915, 10.1929, 10.30937, 10.21969, 9.91592]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37021.0, 37806.0, 36157.0, 33974.0, 34873.0, 30957.0, 35062.0, 36419.0, 37713.0]}, "iteration_timing_avg": 0.35529294117647064} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79756, 10.86462, 10.87239, 10.80678, 10.7118, 10.63911, 10.19319, 10.30944, 10.21988, 9.91603]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31137.0, 37033.0, 37783.0, 36040.0, 33452.0, 34761.0, 30933.0, 35487.0, 36392.0, 37655.0]}, "iteration_timing_avg": 0.3566726470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86367, 10.80237, 10.71665, 10.6452, 10.21186, 10.32279, 10.22474, 9.93034]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38080.0, 36072.0, 33389.0, 34302.0, 30262.0, 35071.0, 36081.0, 36818.0]}, "iteration_timing_avg": 0.2153429411764706} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80355, 10.86508, 10.86435, 10.80239, 10.7159, 10.6454, 10.21181, 10.32236, 10.22471, 9.92956]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31271.0, 37782.0, 38276.0, 36278.0, 32946.0, 34291.0, 30145.0, 35217.0, 36060.0, 37032.0]}, "iteration_timing_avg": 0.21900323529411767} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80324, 10.86861, 10.87486, 10.7986, 10.66452, 10.58021, 10.05487, 10.18533, 10.097, 9.75749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [26144.0, 31960.0, 32510.0, 31451.0, 28954.0, 30872.0, 29506.0, 33312.0, 34558.0, 36855.0]}, "iteration_timing_avg": 0.28211852941176474} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 
10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} 
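Several of the tp2_pp2 variants above (the baseline, `ddp_average_in_collective`, and `no_create_attention_mask_in_dataloader`) carry identical golden values: those switches change implementation details, not the numerics, so their loss trajectories are expected to match exactly. A hypothetical check of that property, again reusing `load_golden`:

a = load_golden("gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G.json")
b = load_golden("gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G.json")
# Numerics-neutral options must not perturb the trajectory at all.
assert a["lm loss"] == b["lm loss"] and a["num-zeros"] == b["num-zeros"]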
-------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_resume_dgx_a100_1N8G_tp1_pp2.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62396, 10.53554, 10.25187, 10.20873, 9.96714, 9.96605, 9.92367, 9.79179, 9.26742, 9.61926, 9.18974, 9.46019, 9.62277]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2078.0, 2328.0, 2420.0, 2256.0, 2180.0, 2078.0, 2313.0, 2933.0, 2712.0, 2270.0, 2872.0, 3003.0, 3555.0, 3066.0, 3103.0, 3098.0, 3762.0]}, "iteration_timing_avg": 0.13093716417910448} -------------------------------------------------------------------------------- 
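`iteration_timing_avg`, by contrast, is a single scalar per file: the average wall-clock time per training iteration over the run (plausibly in seconds, given the magnitudes). Timing is far noisier than loss across machines, so any regression check against it should be much looser than the metric comparison; the slack factor below is purely an assumed example.

def check_timing(expected_avg, actual_avg, slack=1.5):
    # Flag only large regressions; small timing jitter is expected run to run.
    assert actual_avg <= expected_avg * slack, (
        f"iteration time regressed: {actual_avg:.3f}s vs golden {expected_avg:.3f}s"
    )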
/tests/functional_tests/test_results/jet/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/gpt3_mr_tp2_pp2_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13995, 9.14036, 9.13054, 9.12408, 9.0791, 9.06608, 9.01164, 8.97073, 8.93805, 8.85873]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2852600.0, 2939939.0, 2850191.0, 2774638.0, 3035015.0, 2853397.0, 2787109.0, 2832834.0, 2809354.0, 2940633.0]}, "iteration_timing_avg": 0.2253964705882353} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/multimodal_llava_mr_mcore_te_tp2_pp4_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13682, 9.13803, 9.13233, 9.12379, 9.09228, 9.07609, 9.02997, 8.99391, 8.96074, 8.89575]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2918419.0, 3005942.0, 2916151.0, 2840544.0, 3100625.0, 2919164.0, 2852935.0, 2898444.0, 2875057.0, 3006499.0]}, "iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/jet/t5_220m_mr_mcore_tp2_pp4_dgx_a100_1N8G.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.39452, 9.22332, 8.69422, 8.39796, 8.11874, 8.01176, 7.72419, 7.44126, 7.3078, 7.2363]}, "num-zeros": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [115739.0, 111092.0, 117169.0, 112383.0, 118597.0, 117024.0, 111417.0, 114098.0, 118529.0, 117033.0]}, "iteration_timing_avg": 0.2253964705882353} -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import torch._dynamo 2 | torch._dynamo.config.suppress_errors = True -------------------------------------------------------------------------------- /tests/unit_tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/data/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/data/test_multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | ## 4 | # Compile megatron.core.datasets.helpers dependencies before BlendedDataset import 5 | ## 6 | 7 | import torch 8 | 9 | from megatron.core.datasets.utils import compile_helpers 10 | from tests.unit_tests.test_utilities import Utils 11 | 12 | if torch.distributed.is_available(): 13 | Utils.initialize_distributed() 14 | if torch.distributed.get_rank() == 0: 15 | compile_helpers() 16 | torch.distributed.barrier() 17 | else: 18 | compile_helpers() 19 | 20 | ## 21 | # Done 22 | ## 23 | 24 | from types import SimpleNamespace 25 | 26 | from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder 27 | from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig 28 | from megatron.training.tokenizer.tokenizer import _NullTokenizer 29 | 30 | _MOCK_VOCAB_SIZE = 8192 31 | 32 | 33 | def test_mock_multimodal_dataset(): 34 | config = MultimodalDatasetConfig( 35 | random_seed=1234, 36 | sequence_length=1024, 37 | reset_position_ids=False, 38 | reset_attention_mask=False, 39 | eod_mask_loss=True, 40 | image_h=336, 41 | image_w=336, 42 | split="990,9,1", 43 | tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE), 44 | ) 45 | 46 | datasets = BlendedMegatronDatasetBuilder( 47 | MockMultimodalDataset, [100, 100, 100], lambda: True, config 48 | ).build() 49 | 50 | for ds in datasets: 51 | sample = ds[0] 52 | assert "image" in sample 53 | assert sample["image"].shape == torch.Size([3, 336, 336]) 54 | assert "tokens" in sample 55 | 56 | 57 | if __name__ == "__main__": 58 | test_mock_multimodal_dataset() 59 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import weakref 3 | from pathlib import Path 4 | from shutil import rmtree 5 | from tempfile import TemporaryDirectory 6 | from typing import Union, Optional 7 | 8 | from tests.unit_tests.test_utilities import Utils 9 | 10 | 11 | def empty_dir(path: Path): 12 | if Utils.rank > 0: 13 | return 14 | for p in path.iterdir(): 15 | if p.is_dir(): 16 | rmtree(p) 17 | else: 18 | p.unlink() 19 | 20 | 21 | 22 | class TempNamedDir(TemporaryDirectory): 23 | """ TemporaryDirectory with a fully named directory. Empties the dir if not empty. 
""" 24 | def __init__(self, name: Union[str, Path], sync=True, 25 | ignore_cleanup_errors=False) -> None: 26 | self.name = str(name) 27 | if Utils.rank == 0: 28 | os.makedirs(name, exist_ok=True) 29 | empty_dir(Path(name)) 30 | 31 | self._ignore_cleanup_errors = ignore_cleanup_errors 32 | self._finalizer = weakref.finalize( 33 | self, self._cleanup, self.name, 34 | warn_message="Implicitly cleaning up {!r}".format(self)) 35 | self.sync = sync 36 | 37 | def cleanup(self, override_sync: Optional[bool] = None) -> None: 38 | sync = self.sync if override_sync is None else override_sync 39 | if sync : 40 | import torch 41 | torch.distributed.barrier() 42 | 43 | if Utils.rank == 0: 44 | super().cleanup() 45 | 46 | def __enter__(self): 47 | path = Path(super().__enter__()) 48 | if self.sync: 49 | import torch 50 | torch.distributed.barrier() 51 | return path 52 | 53 | def __exit__(self, exc_type, exc_val, exc_tb): 54 | raised = exc_type is not None 55 | if not raised: 56 | self.cleanup() 57 | 58 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from unittest import mock 3 | 4 | import pytest 5 | 6 | from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy 7 | from tests.unit_tests.dist_checkpointing import TempNamedDir 8 | from tests.unit_tests.test_utilities import Utils 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def tmp_path_dist_ckpt(tmp_path_factory) -> Path: 13 | """ Common directory for saving the checkpoint. 14 | 15 | Can't use pytest `tmp_path_factory` directly because directory must be shared between processes. 
""" 16 | 17 | tmp_dir = tmp_path_factory.mktemp('ignored', numbered=False) 18 | tmp_dir = tmp_dir.parent.parent / 'tmp_dist_ckpt' 19 | 20 | if Utils.rank == 0: 21 | with TempNamedDir(tmp_dir, sync=False): 22 | yield tmp_dir 23 | 24 | else: 25 | yield tmp_dir 26 | 27 | 28 | @pytest.fixture(scope='session', autouse=True) 29 | def set_default_dist_ckpt_strategy(): 30 | def get_pyt_dist_save_sharded_strategy(): 31 | return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1) 32 | 33 | with mock.patch( 34 | 'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy', 35 | new=get_pyt_dist_save_sharded_strategy, 36 | ) as _fixture: 37 | yield _fixture 38 | -------------------------------------------------------------------------------- /tests/unit_tests/dist_checkpointing/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/dist_checkpointing/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/fusions/test_torch_softmax.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax 5 | from megatron.core.transformer.enums import AttnMaskType 6 | from megatron.core.transformer.utils import attention_mask_func 7 | 8 | 9 | class TestTorchSoftmax: 10 | def setup_method(self, method): 11 | # The important settings tested are forward_torch_softmax path 12 | # with locally generated casual mask for attention_mask_func: 13 | self.softmax = FusedScaleMaskSoftmax( 14 | input_in_fp16=False, 15 | input_in_bf16=False, 16 | attn_mask_type=AttnMaskType.causal, 17 | scaled_masked_softmax_fusion=False, 18 | mask_func=attention_mask_func, 19 | softmax_in_fp32=True, 20 | scale=None, 21 | ) 22 | 23 | def test_output_shape(self): 24 | x = torch.randn(8, 2, 4, 4, device="cuda") 25 | y = self.softmax(x, None) 26 | assert x.shape == y.shape 27 | 28 | def test_causal_mask_input_shape_assert(self): 29 | x = torch.randn(1, 1, 4, 16, device="cuda") 30 | with pytest.raises(AssertionError): 31 | self.softmax(x, None) 32 | 33 | def test_causal_mask_equal_scores(self): 34 | # For equal input values (e.g. zero) correctly masked softmax should 35 | # produce equal scores among non-masked elements. 
For example, in case 36 | # sq == sk == 2 the expected output is (ignoring b and np dimensions): 37 | # [[1.0, 0.0], 38 | # [0.5, 0.5]] 39 | b, np, sq, sk = 8, 2, 32, 32 40 | x = torch.zeros([b, np, sq, sk]).cuda() 41 | y = self.softmax(x, None) 42 | y_expected = torch.tril(torch.ones(b, np, sq, sk, device="cuda")) 43 | y_expected /= torch.arange(1, sq + 1, device="cuda").reshape((-1, 1)) 44 | assert torch.allclose(y, y_expected, rtol=1e-08, atol=1e-08) 45 | -------------------------------------------------------------------------------- /tests/unit_tests/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/engines/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/model_inference_wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/model_inference_wrappers/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig 3 | 4 | class TestModelInferenceWrapperConfig: 5 | 6 | def test_inference_params(self): 7 | inference_parameters = InferenceWrapperConfig( 8 | hidden_size=10, 9 | inference_batch_times_seqlen_threshold=10, 10 | padded_vocab_size=10, 11 | params_dtype=torch.float, 12 | fp32_residual_connection=False 13 | ) 14 | inference_parameters.add_attributes({"abc": 45}) 15 | assert inference_parameters.abc == 45, f"abc attribute not set correctly. it is {inference_parameters.abc}" -------------------------------------------------------------------------------- /tests/unit_tests/inference/test_common_inference_params.py: -------------------------------------------------------------------------------- 1 | from megatron.core.inference.common_inference_params import CommonInferenceParams 2 | 3 | class TestCommonInferenceParams: 4 | 5 | def test_inference_params(self): 6 | inference_parameters = CommonInferenceParams() 7 | inference_parameters.add_attributes({"min_tokens": 45}) 8 | assert inference_parameters.min_tokens == 45, f"min tokens not set correctly.
it is {inference_parameters.min_tokens}" -------------------------------------------------------------------------------- /tests/unit_tests/inference/test_inference_utils.py: -------------------------------------------------------------------------------- 1 | from megatron.core.inference.utils import Counter 2 | 3 | class TestInferenceUtils: 4 | 5 | def test_counter(self): 6 | counter = Counter() 7 | r = next(counter) 8 | assert r == 0, f'Counter return value should be 0 but it is {r}' 9 | assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}' 10 | counter.reset() 11 | assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}' 12 | -------------------------------------------------------------------------------- /tests/unit_tests/inference/text_generation_controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/inference/text_generation_controllers/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/tensor_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | 
input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed,get_cuda_rng_tracker 3 | from megatron.core.tensor_parallel.random import checkpoint 4 | from tests.unit_tests.test_utilities import Utils 5 | import pytest 6 | import torch 7 | 8 | def test_cuda_rng_states_tracker(): 9 | rng_tracker = CudaRNGStatesTracker() 10 | rng_tracker.set_states({"state1":1234}) 11 | assert(rng_tracker.get_states()["state1"] == 1234) 12 | rng_tracker.reset() 13 | assert(rng_tracker.get_states() == {}) 14 | seed = 1111 15 | rng_tracker.add("state2",seed) 16 | with pytest.raises(Exception): 17 | assert(rng_tracker.add("state3",seed)) 18 | with pytest.raises(Exception): 19 | assert(rng_tracker.add("state2",111)) 20 | assert(rng_tracker.get_states()['state2'] is not None) 21 | with pytest.raises(Exception): 22 | assert() 23 | 24 | rng_tracker.fork("state2") 25 | torch.cuda.manual_seed(seed) 26 | rng_state = torch.cuda.get_rng_state() 27 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 28 | 29 | def test_model_parallel_cuda_manual_seed(): 30 | Utils.initialize_model_parallel(4,2) 31 | model_parallel_cuda_manual_seed(0) 32 | rng_tracker = get_cuda_rng_tracker() 33 | assert(rng_tracker.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() 45 | -------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_local_multi_tensor_fns.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from megatron.core.utils import ( 3 | local_multi_tensor_applier, 4 | local_multi_tensor_l2_norm, 5 | local_multi_tensor_scale 6 | ) 7 | import pytest 8 | import torch 9 | 10 | def test_local_multi_tensor_l2_norm_and_scale(): 11 | amp_C = pytest.importorskip("amp_C") 12 | multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") 13 | 14 | torch.manual_seed(42) 15 | 16 | tensor_list = [torch.rand(5,5).cuda() for _ in range(10)] 17 | 
--------------------------------------------------------------------------------
/tests/unit_tests/test_basic.py:
--------------------------------------------------------------------------------
def test_import():
    import megatron
--------------------------------------------------------------------------------
/tests/unit_tests/test_local_multi_tensor_fns.py:
--------------------------------------------------------------------------------
import copy
from megatron.core.utils import (
    local_multi_tensor_applier,
    local_multi_tensor_l2_norm,
    local_multi_tensor_scale,
)
import pytest
import torch

def test_local_multi_tensor_l2_norm_and_scale():
    amp_C = pytest.importorskip("amp_C")
    multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply")

    torch.manual_seed(42)

    tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)]
    # Deep-copy so the in-place scale below can be compared across implementations.
    tensor_list_copy = copy.deepcopy(tensor_list)

    norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    norm_local, _ = multi_tensor_apply.multi_tensor_applier(local_multi_tensor_l2_norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy], False)
    torch.testing.assert_close(norm_apex, norm_local)

    clip_coeff = 0.05
    multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list, tensor_list], clip_coeff)
    multi_tensor_apply.multi_tensor_applier(local_multi_tensor_scale, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list_copy, tensor_list_copy], clip_coeff)
    torch.testing.assert_close(tensor_list, tensor_list_copy)

def test_local_multi_tensor_apply():
    amp_C = pytest.importorskip("amp_C")
    multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply")

    tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)]

    norm_apex, _ = multi_tensor_apply.multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    norm_local, _ = local_multi_tensor_applier(amp_C.multi_tensor_l2norm, torch.tensor([0], dtype=torch.int, device='cuda'), [tensor_list], False)
    torch.testing.assert_close(norm_apex, norm_local)
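The tests above only cross-check each local helper against its apex counterpart. A sketch of the intended all-local composition when apex/amp_C are not installed (an assumption; the calling convention mirrors apex's multi_tensor_applier):

    import torch
    from megatron.core.utils import local_multi_tensor_applier, local_multi_tensor_l2_norm

    tensors = [torch.rand(5, 5, device='cuda') for _ in range(10)]
    noop_flag = torch.tensor([0], dtype=torch.int, device='cuda')
    # Pure-PyTorch fallback grad-norm, no fused kernels required.
    norm, _ = local_multi_tensor_applier(local_multi_tensor_l2_norm, noop_flag, [tensors], False)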
--------------------------------------------------------------------------------
/tests/unit_tests/test_training.py:
--------------------------------------------------------------------------------
from types import SimpleNamespace

from megatron.training.global_vars import set_args
from megatron.training.training import build_train_valid_test_data_iterators
from tests.unit_tests.test_utilities import Utils


def mock_train_valid_test_datasets_provider(train_val_test_num_samples):
    return 1, 2, 3


def create_test_args():
    # Set dummy values for the args.
    args = SimpleNamespace()
    args.iteration = 0
    args.train_samples = 1
    args.train_iters = 1
    args.eval_interval = 1
    args.eval_iters = 1
    args.global_batch_size = 1
    args.consumed_train_samples = 1
    args.consumed_valid_samples = 1
    args.dataloader_type = "external"
    args.skip_train = False

    return args


class TestTraining:
    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        args = create_test_args()
        set_args(args)

    def test_build_train_valid_test_data_iterators(self):
        train_iter, valid_iter, test_iter = build_train_valid_test_data_iterators(
            mock_train_valid_test_datasets_provider
        )

        assert (train_iter, valid_iter, test_iter) == (1, 2, 3)

    def teardown_method(self, method):
        Utils.destroy_model_parallel()
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/transformer/__init__.py
--------------------------------------------------------------------------------
/tests/unit_tests/transformer/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sail-sg/VocabularyParallelism/6191c3bff018e2407d0c2d543be840fb3bd437f6/tests/unit_tests/transformer/moe/__init__.py
--------------------------------------------------------------------------------
/tools/autoformat.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -euox pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
CHECK_ONLY=${CHECK_ONLY:-false}
CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/main megatron/core | grep '\.py$' || true)
ADDITIONAL_ARGS=""

if [[ $CHECK_ONLY == true ]]; then
  ADDITIONAL_ARGS="--check"
fi

# For now we just format megatron/core.
if [[ -n "$CHANGED_FILES" ]]; then
  black $ADDITIONAL_ARGS --verbose --diff $CHANGED_FILES
  isort $ADDITIONAL_ARGS $CHANGED_FILES
else
  echo "Changeset is empty, all good."
fi
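A typical local invocation is `CHECK_ONLY=true bash tools/autoformat.sh` to report violations without rewriting files, or plain `bash tools/autoformat.sh` to apply black and isort in place. Both assume `origin/main` has been fetched, since the changeset is computed against its merge base.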
--------------------------------------------------------------------------------
/tools/bert_embedding/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .embed import BertEmbedder, DiskDataParallelBertEmbedder
--------------------------------------------------------------------------------
/tools/bert_embedding/dataset.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import numpy as np
import torch

from megatron.training import get_args, get_tokenizer


class BertEmbeddingDataset(torch.utils.data.Dataset):
    '''Dataset to convert a text dataset to Bert tokens.'''

    def __init__(self, text_dataset, max_seq_length):

        super().__init__()

        args = get_args()

        # Dataset, tokenizer.
        self.text_dataset = text_dataset
        self.max_seq_length = max_seq_length
        self.bert_tokenizer = get_tokenizer()

    def __len__(self):
        return len(self.text_dataset)

    @classmethod
    def build_sample(cls, tokenizer, token_ids):
        get_constant_array = lambda c: np.full((len(token_ids) + 2,), c, "int64")
        return {
            "text": np.array([tokenizer.cls, *token_ids, tokenizer.sep], dtype="int64"),
            "types": get_constant_array(0),
            "labels": get_constant_array(-1),
            "is_random": 0,
            "loss_mask": get_constant_array(0),
            "padding_mask": get_constant_array(1),
            "truncated": 0,
        }

    def __getitem__(self, idx):

        # Text.
        text_sample = self.text_dataset[idx]
        text = text_sample["text"]
        text = text.replace("<|endoftext|>", "")

        # Bert/Wordpiece tokens (+truncate).
        bert_token_ids = self.bert_tokenizer.tokenize(text)
        bert_token_ids = bert_token_ids[:self.max_seq_length - 2]  # Leave room for cls + sep.
        if not bert_token_ids:
            bert_token_ids = [self.bert_tokenizer.pad_id]  # Fallback for an empty sequence.

        # Bert sample.
        sample = self.build_sample(self.bert_tokenizer, bert_token_ids)

        return sample
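As a concrete illustration of `build_sample`: for `token_ids` `[5, 6]` and hypothetical special-token ids `cls=101`, `sep=102`, the sample's "text" field is `[101, 5, 6, 102]`, and every constant array ("types", "labels", "loss_mask", "padding_mask") has the matching length of 4.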
--------------------------------------------------------------------------------
/tools/bert_embedding/external_libs.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import importlib

required_libs = [
    "h5py",
    "transformers",  # for huggingface bert
]

for lib in required_libs:
    try:
        globals()[lib] = importlib.import_module(lib)
    except ImportError as e:
        raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") from e
--------------------------------------------------------------------------------
/tools/checkpoint/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import psutil


def print_memory_usage(key, rank, num_ranks):
    '''Print memory usage.'''
    process = psutil.Process()
    mem_info = process.memory_info()
    print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb."
          % (
              key,
              rank,
              num_ranks,
              mem_info.rss / 1024**3,
              # memory_percent() == 100 * rss / total, so this term recovers
              # total system memory in gb.
              100 * mem_info.rss / process.memory_percent() / 1024**3,
          ))


def get_mcore_transformer_block_key(model_key):
    return {
        "GPT": "decoder",
        "BERT": "encoder",
    }[model_key]
--------------------------------------------------------------------------------
/tools/linter.py:
--------------------------------------------------------------------------------
import os
import os.path as osp
import pathlib
import subprocess


def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm"""

    try:
        import autopep8
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # Get all python file paths from the top-level directory.
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    all_py_paths = set(os.path.join(working_dir, fname)
                       for fname in os.listdir(working_dir) if fname.endswith(".py"))

    # Get all python file paths from the chosen subdirectories.
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith(".py")))

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass the command as an argument list: a single string would require
        # shell=True and would break on paths containing spaces.
        command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
--------------------------------------------------------------------------------
/tools/openwebtext/add_id.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

import argparse
import json
import time

"""
This code adds an id to each json object in a json file. The user can add a
prefix to the ids.
"""

if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', type=str, default=None,
                        help='Input json file where an id needs to be added')
    parser.add_argument('--output-file', type=str, default=None,
                        help='Output file name with id')
    parser.add_argument('--id-prefix', type=str, default=None,
                        help='Id prefix')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')
    args = parser.parse_args()

    print('Adding ids to dataset ...')

    f_input = open(args.input_file, 'r', encoding='utf-8')
    f_output = open(args.output_file, 'wb')

    unique_ids = 1
    start_time = time.time()
    for row in f_input:
        each_row = json.loads(row)
        adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids))
        each_row['adlr_id'] = adlr_id_string
        myjson = json.dumps(each_row, ensure_ascii=False)

        f_output.write(myjson.encode('utf-8'))
        f_output.write('\n'.encode('utf-8'))

        if unique_ids % args.log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                unique_ids, time.time() - start_time), flush=True)

        unique_ids += 1

    # Close the files.
    f_input.close()
    f_output.close()

    print('done :-)', flush=True)
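A typical invocation (file names and prefix illustrative): `python tools/openwebtext/add_id.py --input-file docs.json --output-file docs_with_id.json --id-prefix owt`, which stamps each JSON row with an `adlr_id` such as `owt-0000000001`.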
--------------------------------------------------------------------------------
/tools/openwebtext/merge_jsons.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.


import argparse
import glob
import json

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--json_path", type=str, default=".",
                        help="path where all the json files are located")

    parser.add_argument("--output_file", type=str, default="merged_output.json",
                        help="filename where the merged json should go")

    args = parser.parse_args()

    json_path = args.json_path
    out_file = args.output_file

    json_files = glob.glob(json_path + '/*.json')

    counter = 0

    with open(out_file, 'w') as outfile:
        for fname in json_files:
            counter += 1

            if counter % 1024 == 0:
                print("Merging at ", counter, flush=True)

            with open(fname, 'r') as infile:
                for row in infile:
                    # Parse each row first so malformed json fails loudly
                    # instead of being silently copied through.
                    json.loads(row)
                    outfile.write(row)

    print("Merged file", out_file, flush=True)
--------------------------------------------------------------------------------
/tools/report_theoretical_memory.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

"""Computes theoretical memory footprint for model training without instantiating
a model and running training iterations on GPU(s)."""

from megatron.training import get_args
from megatron.training.initialize import initialize_megatron
from megatron.training.theoretical_memory_usage import report_theoretical_memory

if __name__ == "__main__":
    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    args = get_args()

    report_theoretical_memory(args, verbose=True)
--------------------------------------------------------------------------------
/tools/retro/cli/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

from .cli import retro
--------------------------------------------------------------------------------
/tools/retro/cli/__main__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import os

from . import retro


if __name__ == "__main__":
    retro.init(os.environ["RETRO_PROJECT_DIR"])
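The module is run with the project directory exported, e.g. `RETRO_PROJECT_DIR=/path/to/retro/project python -m tools.retro.cli` (path illustrative), which `retro.init` reads from the environment.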
--------------------------------------------------------------------------------
/tools/retro/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM nvcr.io/nvidia/pytorch:23.09-py3

# Group the installs to keep the image layer count down.
RUN apt update && apt install -qy htop

RUN pip install -U faiss-gpu transformers google-api-python-client && \
    pip install sentencepiece h5py nltk einops
--------------------------------------------------------------------------------
/tools/retro/sft/README.md:
--------------------------------------------------------------------------------
## Note

The content within this `sft` directory is still under active development and will be updated soon.
--------------------------------------------------------------------------------
/tools/retro/sft/open_inst.sh:
--------------------------------------------------------------------------------
DATA_BLEND="1.0 open_inst"
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import sys
import json
import requests


if __name__ == "__main__":
    url = sys.argv[1]
    url = 'http://' + url + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        # int() already parses the numeric reply; eval() on raw user input
        # was both unnecessary and unsafe.
        tokens_to_generate = int(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
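For a non-interactive check of the same REST protocol, the sketch below issues a single PUT programmatically. It assumes a text generation server is already listening at localhost:5000; the host, port, and prompt are illustrative, while the endpoint path and payload keys mirror the CLI above.

    import json
    import requests

    URL = "http://localhost:5000/api"  # assumed server address
    payload = {"prompts": ["The quick brown fox"], "tokens_to_generate": 16}
    response = requests.put(URL, data=json.dumps(payload),
                            headers={"Content-Type": "application/json"})

    if response.status_code == 200:
        print(response.json()["text"][0])  # generated continuation of the prompt
    else:
        print(f"Error {response.status_code}: {response.json()['message']}")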