├── .coveragerc ├── .github └── workflows │ └── python.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── dataset ├── README.md ├── download_books.sh ├── download_ckpt.sh └── download_vocab.sh ├── docs ├── distrib_optimizer.md └── images │ └── distrib_optimizer │ ├── data_flow.png │ └── sharding_scheme.png ├── examples ├── README.md ├── detxoify_lm │ ├── README.md │ ├── annotations │ │ ├── filter-selfgeneration.py │ │ ├── perspective_api_annotate.py │ │ └── preprocess.sh │ ├── finetune_gpt.py │ ├── finetune_gpt_distributed-1.3b.sh │ ├── generate-1.3b.sh │ ├── generate_samples_gpt.py │ ├── perspective_api.py │ └── self_generation │ │ └── selfgenerate-1.3b-unconditional.sh ├── evaluate_retriever_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── finetune_retriever_distributed.sh ├── merge_mp_bert.sh ├── msdp │ ├── README.md │ ├── data_processing.sh │ ├── eval_knwl_generation.sh │ ├── eval_resp_generation.sh │ ├── prep_resp_gen.sh │ ├── prompt_knwl_gen.sh │ └── prompt_resp_gen.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_text_generation_server_345M.sh ├── run_text_generation_server_345M_8_tensor_parallel.sh └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── examples_deepspeed ├── MoE │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ ├── ds_pretrain_gpt_1.3B_dense.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ ├── ds_pretrain_gpt_125M_MoE64.sh │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ ├── ds_pretrain_gpt_350M_MoE128.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ ├── ds_pretrain_gpt_350M_dense.sh │ ├── ds_pretrain_gpt_6.7B_dense.sh │ └── readme_evalharness.md ├── README.md ├── azure │ ├── README.md │ ├── run-175b.sh │ ├── run-1t.sh │ └── run-benchmark-model.sh ├── azureml │ ├── Dockerfile.dockerfile │ ├── README.md │ ├── aml_submit.py │ └── prepare_dataset.py ├── bert_with_pile │ ├── README.md │ ├── ds_config_bert_TEMPLATE.json │ ├── ds_finetune_bert_mnli.sh │ ├── ds_finetune_bert_qqp.sh │ ├── ds_finetune_bert_race.sh │ ├── ds_pretrain_bert.sh │ └── prepare_pile_data.py ├── compression │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_TEMPLATE_compression.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ └── ds_pretrain_gpt_350M_dense_kd.sh ├── curriculum_learning │ ├── README.md │ ├── ds_config_gpt_slw_TEMPLATE.json │ ├── ds_pretrain_gpt2.sh │ ├── ds_pretrain_gpt_1.3B_rope_slw.sh │ ├── ds_train.sh │ ├── ds_zero_stage_1_config_baseline.json │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json 
├── data_efficiency │ ├── README.md │ ├── analyze_data.py │ ├── bert │ │ ├── ds_analyze_bert_data_map.sh │ │ ├── ds_analyze_bert_data_reduce.sh │ │ ├── finetune │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_mnli.sh │ │ │ ├── ds_finetune_bert_qqp.sh │ │ │ ├── ds_finetune_bert_race.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── finetune_glue │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_glue.sh │ │ │ ├── ds_finetune_bert_glue_run.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── pile_data_download_preprocess.py │ │ └── pretrain │ │ │ ├── ds_config_bert_1clmetric_TEMPLATE.json │ │ │ ├── ds_config_bert_2clmetrics_TEMPLATE.json │ │ │ ├── ds_pretrain_bert_336M_base_script.sh │ │ │ └── ds_pretrain_bert_336M_run.sh │ └── gpt │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── eval │ │ ├── ds_config_eval_dummy.json │ │ ├── ds_evalharness_1gpu.sh │ │ ├── ds_evalharness_gather_result.py │ │ ├── ds_evalharness_parallel_run.sh │ │ └── ds_evalharness_parallel_run_10shot.sh │ │ └── pretrain │ │ ├── ds_config_gpt_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt_2clmetrics_TEMPLATE.json │ │ ├── ds_pretrain_gpt_1.3B_dense_base_script.sh │ │ └── ds_pretrain_gpt_1.3B_dense_run.sh ├── deepspeed4science │ └── megatron_long_seq_support │ │ ├── README.md │ │ ├── ds_config_gpt_TEMPLATE.json │ │ ├── host_file │ │ ├── pretrain_gpt_1.3B_seq_parallel.sh │ │ └── pretrain_gpt_30B_seq_parallel.sh ├── finetune_hf_llama │ ├── README.md │ ├── ds_config.json │ ├── ds_config_empty.json │ └── finetune_llama.sh ├── generate_text.sh ├── offload_pp │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_pretrain_gpt_350M.sh │ └── twin-offload.png ├── pretrain_llama2_distributed.sh ├── pretrain_llama_distributed.sh ├── rebase │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_slw_TEMPLATE.json │ ├── ds_pretrain_gpt_1.3B.sh │ ├── ds_pretrain_gpt_1.3B_megatron_checkpointing.sh │ ├── ds_pretrain_gpt_1.3B_rope.sh │ ├── ds_pretrain_gpt_1.3B_rope_slw.sh │ ├── ds_pretrain_gpt_125M.sh │ ├── ds_pretrain_gpt_125M_flashattn.sh │ ├── ds_pretrain_gpt_13B.sh │ ├── gpt2-merges.txt │ └── gpt2-vocab.json ├── run_deepspeed_example.sh ├── sequence_parallel │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_pretrain_gpt_1.3B_seq_parallel_32k.sh │ ├── ds_pretrain_gpt_30B_seq_parallel_32k.sh │ ├── ds_pretrain_gpt_6.7B_fpdt_32k.sh │ └── preprocess_bookcorpus.py ├── universal_checkpointing │ ├── README.md │ ├── assets │ │ └── image │ │ │ ├── uc_char_training_loss.png │ │ │ ├── uc_char_validation_loss.png │ │ │ ├── uc_stage3_char_training_loss.png │ │ │ └── uc_stage3_char_validation_loss.png │ ├── ds_config.json │ ├── llama │ │ ├── run_llama_bf16.sh │ │ ├── run_tb_analysis_llama.sh │ │ └── run_universal_llama_bf16.sh │ ├── megatron_gpt │ │ ├── run_bf16.sh │ │ ├── run_fp16.sh │ │ ├── run_tb_analysis_gpt.sh │ │ ├── run_tb_analysis_gpt_plot_only.sh │ │ ├── run_universal_bf16.sh │ │ └── run_universal_fp16.sh │ └── tb_analysis │ │ ├── abstract_analysis.py │ │ ├── arguments.py │ │ ├── tb_analysis_script.py │ │ ├── uc_analysis.py │ │ └── utils.py └── zero_bubble_pp │ ├── README.md │ ├── benchmark.png │ ├── bw_split.png │ ├── zbh1.png │ └── zbh1_pretrain_gpt_1.3b.sh ├── finetune_llama.py ├── images ├── Achieved_petaFLOPs.png └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ 
├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── model_parallel_config.py │ ├── models │ │ ├── __init__.py │ │ └── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_embedding.py │ │ │ └── gpt_model.py │ ├── package_info.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── deepspeed_zbh1_engine.py │ │ ├── deepspeed_zbh1_schedule.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── sequence_parallel │ │ ├── __init__.py │ │ └── cross_entropy.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ ├── utils.py │ │ └── weight_grad_store.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── core_attention.py │ │ ├── custom_layers │ │ │ └── transformer_engine.py │ │ ├── enums.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── prompt_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_softmax.cpp │ ├── scaled_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_rmsnorm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rmsnorm.py │ ├── rotary_pos_embedding.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── __init__.py │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── p2p_communication.py ├── profiler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── text_generation_utils.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── 
utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_core.py ├── pretrain_ict.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── cola.py │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ ├── mrpc.py │ ├── qnli.py │ ├── qqp.py │ ├── rte.py │ ├── sst2.py │ └── stsb.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── conftest.py ├── functional_tests │ ├── __init__.py │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── check_slurm_job_completion.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── test_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ └── jobwait.sh │ ├── test_results │ │ ├── bert │ │ │ ├── bert_tp1_pp2_1nodes_50steps.json │ │ │ ├── bert_tp1_pp4_1nodes_50steps.json │ │ │ ├── bert_tp2_pp2_1nodes_50steps.json │ │ │ └── bert_tp4_pp1_1nodes_50steps.json │ │ └── gpt3 │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps.json │ │ │ └── gpt3_tp4_pp1_1nodes_50steps.json │ └── test_scripts │ │ ├── bert │ │ ├── pretrain_bert_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_bert_distributed_test.sh │ │ ├── sbatch_bert_distributed_resume_checkpoint_test.sh │ │ └── sbatch_bert_distributed_test.sh │ │ └── gpt3 │ │ ├── pretrain_gpt3_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_gpt3_distributed_test.sh │ │ ├── sbatch_gpt3_distributed_resume_checkpoint_test.sh │ │ └── sbatch_gpt3_distributed_test.sh ├── models │ ├── __init__.py │ ├── test_gpt_embedding.py │ └── test_gpt_model.py ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py ├── run_megatron.py ├── tensor_parallel │ └── __int__.py ├── test_megatron.py ├── transformer │ ├── __init__.py │ ├── test_core_attention.py │ ├── test_module.py │ ├── test_parallel_attention.py │ ├── test_parallel_mlp.py │ ├── test_parallel_transformer_block.py │ ├── test_parallel_transformer_layer.py │ └── test_transformer_config.py └── unit_tests │ ├── __init__.py │ ├── tensor_parallel │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_parallel_state.py │ ├── test_utilities.py │ └── test_utils.py └── tools ├── __init__.py ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint_loader_megatron.py ├── checkpoint_saver_megatron.py ├── checkpoint_util.py ├── convert_checkpoint ├── README.md ├── 
deepspeed_checkpoint.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── generate_samples_gpt.py ├── hf2megads_weight_converter.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── retro ├── README.md ├── __init__.py ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── get_dataset_configs.sh │ ├── get_preprocess_cmd.sh │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: python 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | '**' 8 | schedule: 9 | - cron: "0 0 * * *" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | unit-tests: 17 | strategy: 18 | matrix: 19 | pyVersion: ["3.7", "3.8", "3.9", "3.10"] 20 | fail-fast: false 21 | 22 | runs-on: ubuntu-22.04 23 | container: 24 | image: deepspeed/gh-builder:py${{ matrix.pyVersion }} 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: environment 30 | run: | 31 | which python 32 | python --version 33 | - name: Install Megatron-DeepSpeed 34 | run: | 35 | pip3 install . 
36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | # Distribution / packaging 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | 8 | # binaries 9 | *.so 10 | 11 | # tmp files 12 | *.swp 13 | 14 | # AML workspace config file 15 | config.json 16 | 17 | .coverage_* 18 | *~ 19 | slurm* 20 | logs 21 | 22 | # Data folder 23 | bookcorpus_data/ -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc 4 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to set up the dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | -------------------------------------------------------------------------------- /dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../..
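# The unzipped checkpoint should follow the standard Megatron layout, i.e.
# checkpoints/gpt2_345m/latest_checkpointed_iteration.txt plus
# checkpoints/gpt2_345m/release/mp_rank_00/model_optim_rng.pt (an assumption
# based on the usual NGC packaging of the 345M model); the example scripts
# then consume it via --load checkpoints/gpt2_345m.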
8 | 9 | -------------------------------------------------------------------------------- /dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/docs/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/docs/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Original examples by NVIDIA/Megatron-LM 2 | 3 | This folder includes examples from the original NVIDIA/Megatron-LM repo. None of them integrate DeepSpeed technologies, and some may not work due to changes in this Megatron-DeepSpeed repo. We therefore recommend the ```../examples_deepspeed/``` folder, which includes examples that have DeepSpeed technologies integrated and are tested by the DeepSpeed team. 4 | -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Change for multinode config 4 | GPUS_PER_NODE=16 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(($RANDOM + 1024)) 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | # input 12 | DATA_PATH=$1 13 | SHARE_DATA=$PWD # current work dir 14 | FINETUNED_PATH="$SHARE_DATA/$2" 15 | lr=$3 16 | bs=$4 17 | iter=$5 18 | CHECKPOINT_PATH=$6 19 | 20 | # vocab 21 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 22 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 23 | 24 | # tensorboard 25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" 26 | mkdir -p ${TENSORBOARD_DIR} 27 | 28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 29 | 30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \ 31 | examples/detxoify_lm/finetune_gpt.py \ 32 | --num-layers 24 \ 33 | --hidden-size 2048 \ 34 | --num-attention-heads 32 \ 35 | --micro-batch-size 4 \ 36 | --global-batch-size $bs \ 37 | --seq-length 2048 \ 38 | --max-position-embeddings 2048 \ 39 | --train-iters $iter \ 40 | --save $FINETUNED_PATH \ 41 | --load $CHECKPOINT_PATH \ 42 | --data-path $DATA_PATH \ 43 | --data-path2 ${DATA_BLEND} \ 44 | --vocab-file $VOCAB_FILE \ 45 | --merge-file $MERGE_FILE \ 46 | --data-impl mmap \ 47 | --split 100,0,0 \ 48 | --distributed-backend nccl \ 49 | --lr-decay-style constant \ 50 | --lr $lr \ 51 | --clip-grad 1.0 \ 52 | --weight-decay 0.1 \ 53 | --adam-beta1 0.9 \ 54 | --adam-beta2 0.95 \ 55 | --checkpoint-activations \ 56 | --log-interval 1 \ 57 | --save-interval 78 \ 58 | --eval-interval 78 \ 59 | --eval-iters 50 \ 60 | --fp16 \ 61 | --DDP-impl local \ 62 | --finetune --no-load-optim \ 63 | --log-validation-ppl-to-tensorboard \ 64 | --tensorboard-dir ${TENSORBOARD_DIR} 65 | -------------------------------------------------------------------------------- /examples/detxoify_lm/generate-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | VOCAB_FILE=gpt2-vocab.json 4 | MERGE_FILE=gpt2-merges.txt 5 | 6 | GPUS_PER_NODE=1 7 | # Change for multinode config 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(($RANDOM + 1024)) 10 | NNODES=1 11 | NODE_RANK=0 12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 13 | NUM_SAMPLES=$(wc -l < $1) 14 | PREFIX=$(basename $2) 15 | SEED=$(($RANDOM)) 16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl 17 | 18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 19 | 20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 21 | --tensor-model-parallel-size 1 \ 22 | --num-layers 24 \ 23 | --hidden-size 2048 \ 24 | --load $CHECKPOINT_PATH \ 25 | --num-attention-heads 32 \ 26 | --max-position-embeddings 2048 \ 27 | --tokenizer-type GPT2BPETokenizer \ 28 | --fp16 \ 29 | --micro-batch-size 400 \ 30 | --seq-length 2048 \ 31 | --out-seq-length 20 \ 32 | --temperature 1.0 \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --sample-input-file $1 \ 36 | --sample-output-file $OUTPUT \ 37 | --num-samples $NUM_SAMPLES \ 38 | --max-tokens-to-oom 1200000 \ 39 | --top_p 0.9 \ 40 | --seed $SEED 41 | 42 | -------------------------------------------------------------------------------- /examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | SHARE_DATA=$PWD # current work dir 4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 6 | 7 | GPUS_PER_NODE=1 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=$(($RANDOM + 1024)) 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | SEED=$3 15 | SUFFIX=$(basename $CHECKPOINT_PATH) 16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ 17 | mkdir -p $save_dir 18 | echo $save_dir/$SEED.out 19 | 20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 21 | 22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 23 | --tensor-model-parallel-size 1 \ 24 | --num-layers 24 \ 25 | --hidden-size 2048 \ 26 | --load $CHECKPOINT_PATH \ 27 | --num-attention-heads 32 \ 28 | --max-position-embeddings 2048 \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --fp16 \ 31 | --micro-batch-size 150 \ 32 | --seq-length 2048 \ 33 | --out-seq-length 1000 \ 34 | --temperature 1.0 \ 35 | --vocab-file $VOCAB_FILE \ 36 | --merge-file $MERGE_FILE \ 37 | --num-samples $1 \ 38 | --top_p 0.9 \ 39 | --max-tokens-to-oom 1200000 \ 40 | --genfile $save_dir/$SEED.out \ 41 | --seed $SEED 42 | 43 | -------------------------------------------------------------------------------- /examples/evaluate_retriever_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model or a finetuned model for Natural Question task 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task RETRIEVER-EVAL \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --activations-checkpoint-method uniform \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --faiss-use-gpu \ 33 | --retriever-report-topk-accuracies 1 5 20 100 \ 34 | --fp16 \ 35 | --indexer-log-interval 1000 \ 36 | --indexer-batch-size 128 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 
| --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- 
/examples/finetune_retriever_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finetune a BERT or pretrained ICT model using Google natural question data 4 | # Datasets can be downloaded from the following link: 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= 16 | 17 | # Load either of the below 18 | BERT_LOAD_PATH= 19 | PRETRAINED_CHECKPOINT= 20 | 21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 22 | --task RET-FINETUNE-NQ \ 23 | --train-with-neg \ 24 | --train-hard-neg 1 \ 25 | --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --tensor-model-parallel-size 1 \ 30 | --tokenizer-type BertWordPieceLowerCase \ 31 | --train-data nq-train.json \ 32 | --valid-data nq-dev.json \ 33 | --save ${CHECKPOINT_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --vocab-file bert-vocab.txt \ 36 | --bert-load ${BERT_LOAD_PATH} \ 37 | --save-interval 5000 \ 38 | --log-interval 10 \ 39 | --eval-interval 20000 \ 40 | --eval-iters 100 \ 41 | --indexer-log-interval 1000 \ 42 | --faiss-use-gpu \ 43 | --DDP-impl torch \ 44 | --fp16 \ 45 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 46 | --seq-length 512 \ 47 | --retriever-seq-length 256 \ 48 | --max-position-embeddings 512 \ 49 | --retriever-score-scaling \ 50 | --epochs 80 \ 51 | --micro-batch-size 8 \ 52 | --eval-micro-batch-size 16 \ 53 | --indexer-batch-size 128 \ 54 | --lr 2e-5 \ 55 | --lr-warmup-fraction 0.01 \ 56 | --weight-decay 1e-1 57 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, covering data preparation as well as knowledge and response generation. More details are available in the [`msdp` task directory](../../tasks/msdp). 5 | 6 | -------------------------------------------------------------------------------- /examples/msdp/eval_knwl_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores.
5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_knowledge_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_knowledge_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ############################################ 32 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 33 | ############################################ 34 | 35 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to 36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 37 | 38 | # To evaluate on these metrics, please set up the environment based on 39 | # the nlg-eval GitHub repo, and run the corresponding evaluation commands. 40 | 41 | nlg-eval \ 42 | --hypothesis= \ 43 | --references= 44 | -------------------------------------------------------------------------------- /examples/msdp/eval_resp_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_response_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_response_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ########################## 32 | # Evaluate the KF1 scores. 33 | ########################## 34 | 35 | MODEL_GEN_PATH= \ 36 | (e.g., /testseen_response_generations.txt) 37 | GROUND_TRUTH_PATH= \ 38 | (e.g., /testseen_knowledge_reference.txt) 39 | 40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 2048 \ 45 | --max-position-embeddings 2048 \ 46 | --micro-batch-size 4 \ 47 | --task MSDP-EVAL-F1 \ 48 | --guess-file ${MODEL_GEN_PATH} \ 49 | --answer-file ${GROUND_TRUTH_PATH} 50 | 51 | 52 | ############################################ 53 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 54 | ############################################ 55 | 56 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to 57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 58 | 59 | # To evaluate on these metrics, please set up the environment based on 60 | # the nlg-eval GitHub repo, and run the corresponding evaluation commands.
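# A filled-in sketch of the nlg-eval call below (the file names are
# placeholders for your own generation/reference files, not files
# produced by this repo):
#   nlg-eval --hypothesis=testseen_response_generations.txt \
#            --references=testseen_response_reference.txt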
61 | 62 | nlg-eval \ 63 | --hypothesis= \ 64 | --references= 65 | -------------------------------------------------------------------------------- /examples/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/msdp/prompt_knwl_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge 5 | # The size of the pretrained language model is 357M 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= (e.g., /357m) 16 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 17 | MERGE_PATH= (e.g., /gpt2-merges.txt) 18 | INPUT_PATH= \ 19 | (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /testseen_knowledge_prompts.json) 22 | OUTPUT_PATH= \ 23 | (e.g., /testseen_knowledge_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type knowledge \ 42 | --num-prompt-examples 10 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/msdp/prompt_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1 5 | # The output is the corresponding response. 
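# (The Stage-1 knowledge generations come from prompt_knwl_gen.sh, and
# prep_resp_gen.sh merges them into the input file consumed here.)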
6 | # The size of the pretrained language model is 357M 7 | 8 | WORLD_SIZE=8 9 | 10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 11 | --nnodes 1 \ 12 | --node_rank 0 \ 13 | --master_addr localhost \ 14 | --master_port 6000" 15 | 16 | CHECKPOINT_PATH= (e.g., /357m) 17 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 18 | MERGE_PATH= (e.g., /gpt2-merges.txt) 19 | INPUT_PATH= (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /response_prompts.txt) 22 | OUTPUT_PATH= \ 23 | (e.g., /output_testseen_response_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type response \ 42 | --num-prompt-examples 20 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/bert-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | BERT_ARGS=" 10 | --num-layers 24 \ 11 | --hidden-size 1024 \ 12 | --num-attention-heads 16 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --micro-batch-size 4 \ 16 | --global-batch-size 8 \ 17 | --lr 0.0001 \ 18 | --train-iters 2000000 \ 19 | --lr-decay-iters 990000 \ 20 | --lr-decay-style linear \ 21 | --min-lr 0.00001 \ 22 | --weight-decay 1e-2 \ 23 | --lr-warmup-fraction .01 \ 24 | --clip-grad 1.0 \ 25 | --fp16 26 | " 27 | 28 | DATA_ARGS=" 29 | --data-path $DATA_PATH \ 30 | --vocab-file $VOCAB_FILE \ 31 | --data-impl mmap \ 32 | --split 949,50,1 33 | " 34 | 35 | OUTPUT_ARGS=" 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 40 | " 41 | 42 | torchrun pretrain_bert.py \ 43 | $BERT_ARGS \ 44 | $DATA_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --save $CHECKPOINT_PATH \ 47 | --load $CHECKPOINT_PATH 48 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 512 \ 30 | --max-position-embeddings 512 \ 31 | --micro-batch-size 4 \ 32 | --global-batch-size 32 \ 33 | --lr 0.0001 \ 34 | --train-iters 1000000 \ 35 | --lr-decay-iters 
990000 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --weight-decay 1e-2 \ 39 | --lr-warmup-fraction .01 \ 40 | --clip-grad 1.0 \ 41 | --fp16 42 | " 43 | 44 | DATA_ARGS=" 45 | --data-path $DATA_PATH \ 46 | --vocab-file $VOCAB_FILE \ 47 | --data-impl mmap \ 48 | --split 949,50,1 49 | " 50 | 51 | OUTPUT_ARGS=" 52 | --log-interval 100 \ 53 | --save-interval 10000 \ 54 | --eval-interval 1000 \ 55 | --eval-iters 10 56 | " 57 | 58 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 59 | $BERT_ARGS \ 60 | $DATA_ARGS \ 61 | $OUTPUT_ARGS \ 62 | --distributed-backend nccl \ 63 | --save $CHECKPOINT_PATH \ 64 | --load $CHECKPOINT_PATH 65 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --pipeline-model-parallel-size 2 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --seq-length 512 \ 32 | --max-position-embeddings 512 \ 33 | --micro-batch-size 2 \ 34 | --global-batch-size 16 \ 35 | --lr 0.0001 \ 36 | --train-iters 1000000 \ 37 | --lr-decay-iters 990000 \ 38 | --lr-decay-style linear \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --lr-warmup-fraction .01 \ 42 | --clip-grad 1.0 \ 43 | --fp16 44 | " 45 | 46 | DATA_ARGS=" 47 | --data-path $DATA_PATH \ 48 | --vocab-file $VOCAB_FILE \ 49 | --data-impl mmap \ 50 | --split 949,50,1 51 | " 52 | 53 | OUTPUT_ARGS=" 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 58 | " 59 | 60 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 61 | $BERT_ARGS \ 62 | $DATA_ARGS \ 63 | $OUTPUT_ARGS \ 64 | --distributed-backend nccl \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH 67 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | CHECKPOINT_PATH= 8 | VOCAB_FILE=/gpt2-vocab.json 9 | MERGE_FILE=/gpt2-merges.txt 10 | DATA_PATH=_text_document 11 | 12 | GPT_ARGS=" 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 1024 \ 17 | --max-position-embeddings 1024 \ 18 | --micro-batch-size 4 \ 19 | --global-batch-size 8 \ 20 | --lr 0.00015 \ 21 | --train-iters 500000 \ 22 | --lr-decay-iters 320000 \ 23 | --lr-decay-style cosine \ 24 | --min-lr 1.0e-5 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 29 | " 30 | 31 | DATA_ARGS=" 32 | --data-path $DATA_PATH \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | 
--eval-iters 10 44 | " 45 | 46 | torchrun pretrain_gpt.py \ 47 | $GPT_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 \ 52 | --activations-checkpoint-method uniform " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --num-layers 24 \ 30 | --hidden-size 1024 \ 31 | --num-attention-heads 16 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --micro-batch-size 8 \ 35 | --global-batch-size 64 \ 36 | --lr 0.00015 \ 37 | --train-iters 500000 \ 38 | --lr-decay-iters 320000 \ 39 | --lr-decay-style cosine \ 40 | --min-lr 1.0e-5 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 45 | " 46 | 47 | DATA_ARGS=" 48 | --data-path $DATA_PATH \ 49 | --vocab-file $VOCAB_FILE \ 50 | --merge-file $MERGE_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun 
$DISTRIBUTED_ARGS pretrain_gpt.py \ 63 | $GPT_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --tensor-model-parallel-size 2 \ 30 | --pipeline-model-parallel-size 2 \ 31 | --sequence-parallel \ 32 | --num-layers 24 \ 33 | --hidden-size 1024 \ 34 | --num-attention-heads 16 \ 35 | --seq-length 1024 \ 36 | --max-position-embeddings 1024 \ 37 | --micro-batch-size 4 \ 38 | --global-batch-size 16 \ 39 | --lr 0.00015 \ 40 | --train-iters 500000 \ 41 | --lr-decay-iters 320000 \ 42 | --lr-decay-style cosine \ 43 | --min-lr 1.0e-5 \ 44 | --weight-decay 1e-2 \ 45 | --lr-warmup-fraction .01 \ 46 | --clip-grad 1.0 \ 47 | --fp16 48 | " 49 | 50 | DATA_ARGS=" 51 | --data-path $DATA_PATH \ 52 | --vocab-file $VOCAB_FILE \ 53 | --merge-file $MERGE_FILE \ 54 | --data-impl mmap \ 55 | --split 949,50,1 56 | " 57 | 58 | OUTPUT_ARGS=" 59 | --log-interval 100 \ 60 | --save-interval 10000 \ 61 | --eval-interval 1000 \ 62 | --eval-iters 10 63 | " 64 | 65 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 66 | $GPT_ARGS \ 67 | $DATA_ARGS \ 68 | $OUTPUT_ARGS \ 69 | --distributed-backend nccl \ 70 | --save $CHECKPOINT_PATH \ 71 | --load $CHECKPOINT_PATH 72 | 73 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/t5-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | T5_ARGS=" 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --max-position-embeddings 512 \ 18 | --micro-batch-size 16 \ 19 | --global-batch-size 16 \ 20 | --lr 0.0001 \ 21 | --train-iters 1000000 \ 22 | --lr-decay-iters 1000000 \ 23 | --lr-decay-style linear \ 24 | --min-lr 0.00001 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 \ 29 | --vocab-extra-ids 100 30 | " 31 | 32 | DATA_ARGS=" 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 44 | " 45 | 46 | torchrun pretrain_t5.py \ 47 | $T5_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --kv-channels 64 \ 30 | --ffn-hidden-size 3072 \ 31 | --encoder-seq-length 512 \ 32 | --decoder-seq-length 128 \ 33 | --max-position-embeddings 512 \ 34 | --micro-batch-size 16 \ 35 | --global-batch-size 128 \ 
36 | --lr 0.0001 \ 37 | --train-iters 1000000 \ 38 | --lr-decay-iters 1000000 \ 39 | --lr-decay-style linear \ 40 | --min-lr 0.00001 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 \ 45 | --vocab-extra-ids 100 46 | " 47 | 48 | DATA_ARGS=" 49 | --data-path $DATA_PATH \ 50 | --vocab-file $VOCAB_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 63 | $T5_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --num-layers 12 \ 28 | --hidden-size 768 \ 29 | --num-attention-heads 12 \ 30 | --kv-channels 64 \ 31 | --ffn-hidden-size 3072 \ 32 | --encoder-seq-length 512 \ 33 | --decoder-seq-length 128 \ 34 | --max-position-embeddings 512 \ 35 | --micro-batch-size 16 \ 36 | --global-batch-size 128 \ 37 | --lr 0.0001 \ 38 | --train-iters 1000000 \ 39 | --lr-decay-iters 1000000 \ 40 | --lr-decay-style linear \ 41 | --min-lr 0.00001 \ 42 | --weight-decay 1e-2 \ 43 | --lr-warmup-fraction .01 \ 44 | --clip-grad 1.0 \ 45 | --fp16 \ 46 | --vocab-extra-ids 100 47 | " 48 | 49 | DATA_ARGS=" 50 | --data-path $DATA_PATH \ 51 | --vocab-file $VOCAB_FILE \ 52 | --data-impl mmap \ 53 | --split 949,50,1 54 | " 55 | 56 | OUTPUT_ARGS=" 57 | --log-interval 100 \ 58 | --save-interval 10000 \ 59 | --eval-interval 1000 \ 60 | --eval-iters 10 61 | " 62 | 63 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 64 | $T5_ARGS \ 65 | $DATA_ARGS \ 66 | $OUTPUT_ARGS \ 67 | --distributed-backend nccl \ 68 | --save $CHECKPOINT_PATH \ 69 | --load $CHECKPOINT_PATH 70 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 
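# Once the server is up, it can be queried over HTTP from another shell.
# A minimal sketch of such a request (the port, endpoint, and JSON fields
# below are assumptions about the flask-restful server started by this
# script, not something this script guarantees; adjust to your deployment):
#
#   curl 'http://localhost:5000/api' -X 'PUT' \
#     -H 'Content-Type: application/json; charset=UTF-8' \
#     -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'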
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --out-seq-length 1024 \ 30 | --temperature 1.0 \ 31 | --vocab-file $VOCAB_FILE \ 32 | --merge-file $MERGE_FILE \ 33 | --top_p 0.9 \ 34 | --seed 42 35 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 
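# For example, the run_figure_* scripts in this directory set it before
# sourcing this file to turn on activation checkpointing:
#
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "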
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Setup 13 | 14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 15 | update the unspecified values (in angle brackets `<...>`) before launching any 16 | scripts. 17 | 18 | 19 | 20 | ## Scripts 21 | 22 | Below is a list of scripts that can be used to reproduce various figures in our 23 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 24 | 25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 26 | for GPT models ranging from 1 billion to 1 trillion parameters. 27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 28 | performance of pipeline parallelism. 29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 30 | the interleaved schedule on a 175B GPT model. 31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 32 | different degrees of pipeline and tensor model parallelism on a model with 33 | 162.2 billion parameters. 34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 35 | different degrees of data and pipeline model parallelism on a model with 36 | 5.9 billion parameters. 37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 38 | different degrees of data and tensor model parallelism on a model with 39 | 5.9 billion parameters. 40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 41 | microbatch size. 42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 43 | activation recomputation. 44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 45 | the scatter-gather communication optimization. 
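Each of these scripts follows the same pattern: the sweep variables (e.g. `PP`, `GBS`) are set at the top, the remaining model and parallelism parameters are derived from them, and the job is submitted by sourcing [`CONFIG.sh`](./CONFIG.sh) and [`SBATCH.sh`](./SBATCH.sh). A typical launch is sketched below (values are illustrative):

```bash
# Fill in CONFIG.sh first (SLURM partition/account, code and data paths).
cd examples/sc21
# Edit the sweep variables at the top of the chosen script,
# e.g. PP=8 and GBS=128 in run_figure_11.sh, then submit:
bash run_figure_11.sh
```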
46 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples_deepspeed/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/azure/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for experimentation on Azure 2 | 3 | The recipes have been tested from the command line on a cluster set up using Azure VMs and VMSS, as well as inside Docker-based environments. 4 | 5 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows: 6 | 7 | ```bash examples_deepspeed/azure/run-benchmark-model.sh``` 8 | 9 | ### Pre-requisites 10 | 11 | To run the above script, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder: 12 | 13 | ```bash dataset/download_books.sh``` 14 | 15 | ```bash dataset/download_vocab.sh``` 16 | 17 | ### Run 175B and 1T models 18 | 19 | We have included two recipes for the 175B model and the 1T model. To train the model, we assume that users will modify and tune hyperparameters and configurations by themselves.
To facilitate initial training, we have made the recipes runnable with the Books dataset as follows. 20 | 21 | ```bash examples_deepspeed/azure/run-175b.sh``` 22 | 23 | ```bash examples_deepspeed/azure/run-1t.sh``` 24 | 25 | ### Note about ZeRO stage 3 and CPU offload 26 | 27 | By default, we have enabled ZeRO Stage 3 for both of the recipes above. For the 1T model, we have also enabled the CPU-offload feature to save on memory and enable a larger batch size that offers better performance. 28 | -------------------------------------------------------------------------------- /examples_deepspeed/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-cuda11.3:12 2 | USER root:root 3 | 4 | RUN pip install pybind11 5 | RUN pip install regex -------------------------------------------------------------------------------- /examples_deepspeed/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Set up an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create an AML dataset. To run a remote AML job, you need to provide an AML FileDataset. 11 | Refer to the [prepare_dataset script](prepare_dataset.py) for how to upload the .bin and .idx files to the blob store and how to create a FileDataset. 12 | 13 | > Note: The folder `bookcorpus_data` used by the [prepare_dataset script](prepare_dataset.py) should not be under the `azureml` directories, because Azure ML does not allow large files (limit: 100 files or 1048576 bytes) in the Docker build context. 14 | 15 | # Training 16 | Run Megatron-DeepSpeed on Azure ML. Refer to the [aml_submit script](aml_submit.py). 17 | -------------------------------------------------------------------------------- /examples_deepspeed/azureml/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Use this script to upload data to the blob store 2 | 3 | # AzureML libraries 4 | from azureml.core import Workspace 5 | from azureml.core.dataset import Dataset 6 | from azureml.data.datapath import DataPath 7 | 8 | ws = Workspace.from_config() 9 | print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') 10 | 11 | data_dir = "bookcorpus_data" # Local directory containing the data (.bin and .idx files) 12 | blobstore_datadir = data_dir # Blob store directory to store the data in 13 | 14 | datastore = ws.get_default_datastore() 15 | 16 | # Book Corpus Data 17 | print("upload dataset to blob store") 18 | uploaded_data = Dataset.File.upload_directory( 19 | src_dir=data_dir, 20 | target=DataPath(datastore, blobstore_datadir), 21 | show_progress=True 22 | ) 23 | 24 | # Usage after uploading the directory 25 | # To refer to the folder directly: 26 | train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)]) 27 | print(train_dataset) 28 | # To refer to a specific file: 29 | # train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir + "/filename.ext")]) 30 | # Create a DatasetConsumptionConfig to specify how to deliver the dataset to a compute target. 31 | # In the submitted run, files in the datasets will be either mounted or downloaded to a local path on the compute target.
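# (Illustrative note, not part of the original script: as_mount() streams
# files from the datastore on demand, which suits large datasets;
# as_download() copies everything to the compute target's local disk up
# front, which can be faster when the job re-reads the same files often.)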
32 | # input_data_dir = train_dataset.as_mount() 33 | # input_data_dir = train_dataset.as_download() 34 | -------------------------------------------------------------------------------- /examples_deepspeed/bert_with_pile/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | 26 | "wall_clock_breakdown" : false 27 | } 28 | -------------------------------------------------------------------------------- /examples_deepspeed/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/compression/ds_config_gpt_TEMPLATE_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false, 38 | 39 | "compression_training": { 40 | "weight_quantization": { 41 | "shared_parameters":{ 42 | "enabled": true, 43 | "quantizer_kernel": false, 44 | "schedule_offset": 50, 45 | "quantize_groups": 48, 46 | "quantize_verbose": false, 47 | "quantization_type": "symmetric", 48 | "rounding": "nearest", 49 | "fp16_mixed_quantize":{ 50 | 
"enabled": false, 51 | "quantize_change_ratio": 0.001 52 | } 53 | }, 54 | "different_groups":{ 55 | "wq1": { 56 | "params": { 57 | "start_bits": 12, 58 | "target_bits": 4, 59 | "quantization_period": 50 60 | }, 61 | "modules": [ 62 | "encoder.layers" 63 | ] 64 | } 65 | } 66 | }, 67 | "activation_quantization": { 68 | "shared_parameters":{ 69 | "enabled": true, 70 | "quantization_type": "asymmetric", 71 | "range_calibration": "static", 72 | "schedule_offset": 50 73 | }, 74 | "different_groups":{ 75 | "aq1": { 76 | "params": { 77 | "bits": 8 78 | }, 79 | "modules": [ 80 | "encoder.layers" 81 | ] 82 | } 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_config_gpt_slw_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "curriculum_learning": { 24 | "enabled": true, 25 | "curriculum_type": "seqlen", 26 | "min_difficulty": CONFIG_CL_MIN, 27 | "max_difficulty": CONFIG_CL_MAX, 28 | "schedule_type": "fixed_linear", 29 | "schedule_config": { 30 | "total_curriculum_step": CONFIG_CL_DURATION, 31 | "difficulty_step": 8 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ 
$SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | 
"wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh: -------------------------------------------------------------------------------- 1 | hostname_and_rank=$1 2 | master_port=$2 3 | pretrained_checkpoint=$3 4 | 5 | # hostname_and_rank="worker-0:0,1,2,3" 6 | # master_port=12345 7 | # pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" 8 | 9 | tasks=( 10 | RTE 11 | MRPC 12 | STS-B 13 | CoLA 14 | SST-2 15 | QNLI 16 | QQP 17 | MNLI 18 | ) 19 | 20 | seeds=( 21 | 1234 22 | 1235 23 | 1236 24 | 1237 25 | 1238 26 | ) 27 | 28 | lrs=( 29 | 2e-5 30 | 3e-5 31 | 4e-5 32 | 5e-5 33 | ) 34 | 35 | for ((i=0;i<${#tasks[@]};++i)); do 36 | task=${tasks[i]} 37 | for ((j=0;j<${#seeds[@]};++j)); do 38 | seed=${seeds[j]} 39 | for ((k=0;k<${#lrs[@]};++k)); do 40 | lr=${lrs[k]} 41 | bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} 42 | done 43 | done 44 | done -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "dataloader_drop_last": true, 24 | "data_efficiency": { 25 | "enabled": true, 26 | "seed": DATA_EFFICIENCY_SEED, 27 | "data_routing": { 28 | "enabled": LTD_ENABLED, 29 | "random_ltd":{ 30 | "enabled": LTD_ENABLED, 31 | "total_layer_num": 24, 32 | "random_ltd_layer_num": 22, 33 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 34 | "model_mask_name": "attention_mask", 35 | "model_type": "encoder", 36 | "hidden_state_order": "seq_batch_dim", 37 | "random_ltd_schedule": { 38 | "min_value": LTD_MIN, 39 | "max_value": LTD_MAX, 40 | "schedule_type":"fixed_linear", 41 | "schedule_config": { 42 | "require_steps": LTD_STEP, 43 | "seq_per_step": 16 44 | } 45 | } 46 | } 47 | }, 48 | "data_sampling": { 49 | "enabled": CL_ENABLED, 50 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 51 | "curriculum_learning": { 52 | "enabled": CL_ENABLED, 53 | "data_cluster_path": "CL_CLUSTER_PATH", 54 | "curriculum_metrics": { 55 | "CL_1st_METRIC_NAME": { 56 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 57 | "index_to_metric_path": "CL_1st_METRIC_PATH", 58 | "difficulty_type": "CL_1st_DIFF_TYPE", 59 | "clustering_type": "CL_1st_CLUSTER_TYPE", 60 | "min_difficulty": CL_1st_MIN, 61 | "max_difficulty": CL_1st_MAX, 62 | "schedule_type": "fixed_root", 63 | "schedule_config": { 64 | "total_curriculum_step": CL_1st_TOTAL_STEP, 65 | "difficulty_step": CL_1st_DIFF_STEP, 66 | "root_degree": CL_1st_ROOT 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- 
/examples_deepspeed/data_efficiency/gpt/eval/ds_config_eval_dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 2048, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 10, 5 | 6 | "zero_optimization": { 7 | "stage": 0 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": true, 12 | 13 | "fp16": { 14 | "enabled": false, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": false 24 | }, 25 | 26 | "wall_clock_breakdown" : false 27 | } -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "dataloader_drop_last": true, 24 | "data_efficiency": { 25 | "enabled": true, 26 | "seed": DATA_EFFICIENCY_SEED, 27 | "data_routing": { 28 | "enabled": LTD_ENABLED, 29 | "random_ltd":{ 30 | "enabled": LTD_ENABLED, 31 | "total_layer_num": 24, 32 | "random_ltd_layer_num": 22, 33 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 34 | "model_mask_name": "attention_mask", 35 | "model_type": "decoder", 36 | "hidden_state_order": "seq_batch_dim", 37 | "random_ltd_schedule": { 38 | "min_value": LTD_MIN, 39 | "max_value": LTD_MAX, 40 | "schedule_type":"fixed_linear", 41 | "schedule_config": { 42 | "require_steps": LTD_STEP, 43 | "seq_per_step": 16 44 | } 45 | } 46 | } 47 | }, 48 | "data_sampling": { 49 | "enabled": CL_ENABLED, 50 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 51 | "curriculum_learning": { 52 | "enabled": CL_ENABLED, 53 | "data_cluster_path": "CL_CLUSTER_PATH", 54 | "curriculum_metrics": { 55 | "CL_1st_METRIC_NAME": { 56 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 57 | "index_to_metric_path": "CL_1st_METRIC_PATH", 58 | "difficulty_type": "CL_1st_DIFF_TYPE", 59 | "clustering_type": "CL_1st_CLUSTER_TYPE", 60 | "min_difficulty": CL_1st_MIN, 61 | "max_difficulty": CL_1st_MAX, 62 | "schedule_type": "fixed_root", 63 | "schedule_config": { 64 | "total_curriculum_step": CL_1st_TOTAL_STEP, 65 | "difficulty_step": CL_1st_DIFF_STEP, 66 | "root_degree": CL_1st_ROOT 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /examples_deepspeed/deepspeed4science/megatron_long_seq_support/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | 
"initial_scale_power": 11 20 | }, 21 | 22 | "flops_profiler": { 23 | "enabled": true, 24 | "profile_step": 1, 25 | "module_depth": -1, 26 | "top_modules": 3, 27 | "detailed": true, 28 | "output_file": null 29 | }, 30 | 31 | "wall_clock_breakdown" : false 32 | } 33 | -------------------------------------------------------------------------------- /examples_deepspeed/deepspeed4science/megatron_long_seq_support/host_file: -------------------------------------------------------------------------------- 1 | worker-1 slots=4 2 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/README.md: -------------------------------------------------------------------------------- 1 | ## Example of Finetuning LLAMA-7B from Hugging Face Weights 2 | 3 | ### Dataset 4 | You can access the dataset from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). 5 | 6 | ### Pre-trained Weights 7 | The pre-trained weights can be found at [Hugging Face - LLAMA-7B](https://huggingface.co/huggyllama/llama-7b). 8 | 9 | ### Usage: 10 | 11 | #### 1. Converting Hugging Face Model Weights to Megatron-Deepspeed Model 12 | ```bash 13 | bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds 14 | ``` 15 | This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script.```convert_mds2hf``` can convert a Megatron-Deepspeed model into the Hugging Face format 16 | 17 | #### 2. Fine-tuning Process 18 | ```bash 19 | bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh 20 | ``` 21 | Execute this command to initiate the finetuning process. The task originates from [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca.git). 
22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/ds_config_empty.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100 5 | } 6 | -------------------------------------------------------------------------------- /examples_deepspeed/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX 3 | CHECKPOINT_PATH=dataset/checkpoints/gpt2_345m 4 | VOCAB_FILE=dataset/gpt2-vocab.json 5 | MERGE_FILE=dataset/gpt2-merges.txt 6 | b=8 7 | mp=1 8 | experts=1 9 | nodes=1 10 | gpus=1 11 | 12 | 13 | use_tutel="" 14 | #use_tutel="--use-tutel" 15 | 16 | 17 | ds_inference="" 18 | #ds_inference="--ds-inference" 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | 22 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" 23 | L=24 24 | H=1024 25 | A=16 26 | #experts1=${experts[$k]} 27 | program_cmd="tools/generate_samples_gpt.py \ 28 | --tensor-model-parallel-size $mp \ 29 | --num-layers $L \ 30 | --hidden-size $H \ 31 | --num-attention-heads $A \ 32 | --max-position-embeddings 1024 \ 33 | --tokenizer-type GPT2BPETokenizer \ 34 | --fp16 \ 35 | --num-experts ${experts} \ 36 | --mlp-type standard \ 37 | --micro-batch-size $b \ 38 | --seq-length 1024 \ 39 | --out-seq-length 1024 \ 40 | --temperature 1.0 \ 41 | --vocab-file $VOCAB_FILE \ 42 | --merge-file $MERGE_FILE \ 43 | --genfile unconditional_samples.json \ 44 | --top_p 0.9 \ 45 | --log-interval 1 \ 46 | --num-samples 0 \ 47 | --load $CHECKPOINT_PATH \ 48 | $use_tutel $ds_inference" 49 | 50 | echo $launch_cmd $program_cmd 51 | $launch_cmd $program_cmd 52 | -------------------------------------------------------------------------------- /examples_deepspeed/offload_pp/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 3, 8 | "offload_optimizer": { 9 | "device": "cpu", 10 | "pin_memory": true, 11 | "ratio": 0.3 12 | } 13 | }, 14 | 15 | "gradient_clipping": 1.0, 16 | "prescale_gradients":false, 17 | 18 | "fp16": { 19 | "enabled": CONFIG_FP16_ENABLED, 20 | "loss_scale": 0, 21 | "loss_scale_window": 500, 22 | "hysteresis": 2, 23 | "min_loss_scale": 1, 24 | "initial_scale_power": 11 25 | }, 26 | 27 | "bf16": { 28 | "enabled": CONFIG_BF16_ENABLED 29 | }, 30 | 31 | "wall_clock_breakdown" : false 32 | } 33 | -------------------------------------------------------------------------------- /examples_deepspeed/offload_pp/twin-offload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/offload_pp/twin-offload.png 
-------------------------------------------------------------------------------- /examples_deepspeed/rebase/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/rebase/ds_config_gpt_slw_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "curriculum_learning": { 24 | "enabled": true, 25 | "curriculum_type": "seqlen", 26 | "min_difficulty": CONFIG_CL_MIN, 27 | "max_difficulty": CONFIG_CL_MAX, 28 | "schedule_type": "fixed_linear", 29 | "schedule_config": { 30 | "total_curriculum_step": CONFIG_CL_DURATION, 31 | "difficulty_step": 8 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples_deepspeed/run_deepspeed_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | BASE_PATH=/vc_data/Megatron-LM/data 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron 6 | DS_CONFIG=ds_config.json 7 | 8 | TP=1 9 | PP=1 10 | NLAYERS=24 11 | HIDDEN=512 12 | 13 | GLOBAL_BATCH=64 14 | MICRO_BATCH=4 15 | 16 | ZERO_STAGE=2 17 | 18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 20 | mkdir -p $OUTPUT_DIR 21 | 22 | cat <<EOT > $DS_CONFIG 23 | { 24 | "train_batch_size" : $GLOBAL_BATCH, 25 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 26 | "steps_per_print": 1, 27 | 28 | "zero_optimization": { 29 | "stage": $ZERO_STAGE 30 | }, 31 | 32 | "fp16": { 33 | "enabled": true, 34 | "initial_scale_power": 12 35 | }, 36 | 37 | "wall_clock_breakdown" : true 38 | } 39 | EOT 40 | 41 | export NCCL_DEBUG=warn 42 | 43 | ds_args="" 44 | ds_args=" --deepspeed ${ds_args}" 45 | ds_args=" --no-pipeline-parallel ${ds_args}" 46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 49 | 50 | 51 | deepspeed pretrain_gpt.py \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size $PP \ 54 | --num-layers $NLAYERS \ 55 | --hidden-size $HIDDEN \ 56 | --num-attention-heads 16 \ 57 | --seq-length 256 \ 58 | --loss-scale 12 \ 59 | --max-position-embeddings 1024 \ 60 | --micro-batch-size $MICRO_BATCH \ 61 | --global-batch-size $GLOBAL_BATCH \ 62 | --train-iters 1000 \ 63 | --lr 6.0e-5 \ 64 | --min-lr 6.0e-6 \ 65 |
--lr-decay-style cosine \ 66 | --log-interval 1 \ 67 | --eval-iters 40 \ 68 | --eval-interval 1000 \ 69 | --data-path $DATA_PATH \ 70 | --vocab-file $BASE_PATH/gpt2-vocab.json \ 71 | --merge-file $BASE_PATH/gpt2-merges.txt \ 72 | --save-interval 1000 \ 73 | --split 98,2,0 \ 74 | --clip-grad 1.0 \ 75 | --weight-decay 0.1 \ 76 | --adam-beta1 0.9 \ 77 | --adam-beta2 0.95 \ 78 | --init-method-std 0.006 \ 79 | --fp16 \ 80 | --checkpoint-activations \ 81 | --tensorboard-dir $OUTPUT_DIR \ 82 | $ds_args \ 83 | --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log 84 | 85 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/README.md: -------------------------------------------------------------------------------- 1 | # Sequence Parallelism 2 | 3 | This folder contains examples that demonstrate how to use DeepSpeed's sequence parallelism. 4 | 5 | ## Setting Up the Environment for FlashAttention 6 | 7 | DeepSpeed's sequence parallelism can be combined with the following types of attention. 8 | 9 | - Classic attention 10 | - FlashAttention (enabled by `--use-flash-attn`) 11 | - FlashAttention + Triton (enabled by `--use-flash-attn-triton`) 12 | 13 | For the best performance, we recommend using FlashAttention + Triton. Here are the installation steps and the versions we have tested. Note that FlashAttention is compatible only with Turing, Ampere, Ada, or Hopper GPUs. 14 | 15 | ```shell 16 | # install triton 17 | git clone -b legacy-backend https://github.com/openai/triton 18 | cd triton/python/ 19 | pip install cmake 20 | pip install . 21 | 22 | # install flash-attention 23 | cd ${WORK_DIR} 24 | git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention 25 | cd flash-attention 26 | python setup.py install 27 | ``` 28 | 29 | ## Enabling Sequence Parallelism 30 | 31 | To enable sequence parallelism, set the degree of parallelism using the `--ds-sequence-parallel-size` argument. Ensure that the number of attention heads is divisible by this value. 32 | Ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) documentation for more details. 33 | 34 | Some working examples that enable sequence parallelism ([GPT1.3B](ds_pretrain_gpt_1.3B_seq_parallel_32k.sh), [GPT30B](ds_pretrain_gpt_30B_seq_parallel_32k.sh)) are available in this folder. 35 | 36 | Please note that our sequence parallelism feature is currently incompatible with Megatron-LM's tensor or pipeline parallelism.
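As a minimal sketch of the divisibility requirements described above (the values are illustrative, and this is not a full launch command):

```bash
NUM_HEADS=32   # --num-attention-heads
SP_SIZE=4      # --ds-sequence-parallel-size
HIDDEN=4096    # hidden size; head size = HIDDEN / NUM_HEADS

# The number of attention heads must be divisible by the sequence-parallel degree.
if [ $((NUM_HEADS % SP_SIZE)) -ne 0 ]; then
  echo "num-attention-heads must be divisible by ds-sequence-parallel-size" >&2
  exit 1
fi
# For the best FlashAttention performance, the head size should be divisible by 8.
if [ $(((HIDDEN / NUM_HEADS) % 8)) -ne 0 ]; then
  echo "warning: head size $((HIDDEN / NUM_HEADS)) is not divisible by 8" >&2
fi

SEQ_PARALLEL_ARGS="--ds-sequence-parallel-size ${SP_SIZE} --use-flash-attn-triton"
```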
37 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | train_data = load_dataset('bookcorpus/bookcorpus', split='train') 4 | train_data.to_json("BookCorpusDataset_text_document.json", lines=True) 5 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 16, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1, 5 | 6 | "zero_optimization": { 7 | "stage": 1 8 | }, 9 | 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | 14 | "data_types": { 15 | "grad_accum_dtype": "fp32" 16 | }, 17 | 18 | "wall_clock_breakdown" : false 19 | } 20 | -------------------------------------------------------------------------------- 
/examples_deepspeed/universal_checkpointing/llama/run_tb_analysis_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Llama 7B Universal Checkpointing - Training Loss" \ 19 | 20 | # Validation Loss 21 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 22 | --tb_dir $OUTPUT_PATH \ 23 | --tb_event_key "lm-loss-validation/lm loss validation" \ 24 | --csv_name "val_" \ 25 | --plot_name "uc_char_validation_loss.png" \ 26 | --plot_title "Llama 7B Universal Checkpointing - Validation Loss" \ 27 | --plot_y_label "Validation LM Loss" \ 28 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ 19 | 20 | # Validation Loss 21 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 22 | --tb_dir $OUTPUT_PATH \ 23 | --tb_event_key "lm-loss-validation/lm loss validation" \ 24 | --csv_name "val_" \ 25 | --plot_name "uc_char_validation_loss.png" \ 26 | --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ 27 | --plot_y_label "Validation LM Loss" \ 28 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt_plot_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ 19 | --plot_only \ 20 | --csv_dir "/workspace/uc/megatron/loss_csv" \ 21 | 22 | # Validation Loss 23 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 24 | --tb_dir $OUTPUT_PATH \ 25 | --tb_event_key "lm-loss-validation/lm loss validation" \ 26 | --csv_name "val_" \ 27 | --plot_name "uc_char_validation_loss.png" \ 28 | --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ 29 | --plot_y_label "Validation LM Loss" \ 30 | --plot_only \ 31 | --csv_dir "/workspace/uc/megatron/val_csv" \ 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/abstract_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import abc 7 | from abc import ABC 8 | 9 | 10 | class TensorBoardAnalysis(ABC): 11 | 12 | def __init__(self): 13 | self._name = None 14 | self._label_name = None 15 | self._csv_name = None 16 | 17 | @abc.abstractmethod 18 | def set_names(self, path_name): 19 | ... 20 | 21 | @abc.abstractmethod 22 | def get_label_name(self): 23 | ... 24 | 25 | @abc.abstractmethod 26 | def get_csv_filename(self): 27 | ... 28 | 29 | @abc.abstractmethod 30 | def path_regex(self): 31 | ... 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--tb_dir", required=True, type=str, help="Directory for tensorboard output") 10 | parser.add_argument("--analyzer", default="universal_checkpointing", type=str, choices=["universal_checkpointing"], help="Specify the analyzer to use") 11 | parser.add_argument("--tb_event_key", required=False, default="lm-loss-training/lm loss", type=str, help="Optional override of the TensorBoard event key") 12 | parser.add_argument("--plot_title", required=False, default="Megatron-GPT Universal Checkpointing", type=str, help="Optional override of the plot title") 13 | parser.add_argument("--plot_x_label", required=False, default="Training Step", type=str, help="Optional override of the plot x-label") 14 | parser.add_argument("--plot_y_label", required=False, default="LM Loss", type=str, help="Optional override of the plot y-label") 15 | parser.add_argument("--plot_name", required=False, default="uni_ckpt_char.png", type=str, help="Optional override of the plot file name") 16 | parser.add_argument("--skip_plot", action='store_true', help="Skip generation of plot file") 17 | parser.add_argument("--skip_csv", action='store_true', help="Skip generation of csv files") 18 | parser.add_argument("--use_sns", action='store_true', help="Use the SNS library to format plot") 19 | parser.add_argument("--csv_name", required=False, default="", type=str, help="Unique name for CSV files") 20 | parser.add_argument("--plot_only", action='store_true', help="Plot only using csv files") 21 | parser.add_argument("--csv_dir", required=False, type=str, help="Directory for csv files") 22 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import re 7 | from abstract_analysis import TensorBoardAnalysis 8 | 9 | 10 | class UniversalCheckpointingAnalysis(TensorBoardAnalysis): 11 | 12 | def __init__(self): 13 | self._name = "universal_checkpointing" 14 | 15 | def set_names(self, path_name): 16 | match = re.match(self.path_regex(), path_name) 17 | if not match: 18 | raise ValueError(f"Path ({path_name}) did not match regex ({self.path_regex()})") 19 | tp, pp, dp, sp = match.groups() 20 | 21 | self._label_name = f"Training Run: TP: {tp}, PP: {pp}, DP: {dp}" 22 | self._csv_name = f"uc_out_tp{tp}_pp{pp}_dp{dp}_sp{sp}" 23 | 24 | def get_label_name(self): 25 | return self._label_name 26 | 27 | def get_csv_filename(self): 28 | return self._csv_name 29 | 30 | def path_regex(self): 31 | return r'.*tp(\d+).*pp(\d+).*dp(\d+).*sp(\d+)' 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import os 7 | from uc_analysis import UniversalCheckpointingAnalysis 8 | 9 | 10 | def find_files_prefix(directory, file_prefix): 11 | """ 12 | Searches for files with a specific prefix in a directory using os.walk(). 13 | 14 | Args: 15 | directory (str): The path to the directory to search.
16 | file_prefix (str): The desired file prefix. 17 | 18 | Returns: 19 | list: A list of directories that contain a file with the matching prefix. 20 | """ 21 | matching_paths = [] 22 | for root, _, files in os.walk(directory): 23 | for filename in files: 24 | if root not in matching_paths and filename.lower().startswith(file_prefix.lower()): 25 | matching_paths.append(root) 26 | return matching_paths 27 | 28 | def find_files_suffix(directory, file_suffix): 29 | """ 30 | Searches for files with a specific suffix in a directory using os.walk(). 31 | 32 | Args: 33 | directory (str): The path to the directory to search. 34 | file_suffix (str): The desired file suffix. 35 | 36 | Returns: 37 | list: A list of names of the matching files. 38 | """ 39 | matching_paths = [] 40 | for root, _, files in os.walk(directory): 41 | for filename in files: 42 | if filename not in matching_paths and filename.lower().endswith(file_suffix.lower()): 43 | matching_paths.append(filename) 44 | return matching_paths 45 | 46 | def get_analyzer(analyzer_name): 47 | if analyzer_name == 'universal_checkpointing': 48 | return UniversalCheckpointingAnalysis() 49 | else: 50 | raise ValueError(f"Unsupported analyzer {analyzer_name}") 51 | -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/benchmark.png -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/bw_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/bw_split.png -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/zbh1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/zbh1.png -------------------------------------------------------------------------------- /images/Achieved_petaFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/images/Achieved_petaFLOPs.png -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_wandb_writer 13 | from .global_vars import get_adlr_autoresume 14 | from .global_vars import get_timers 15 | from .initialize import initialize_megatron 16 | 17 | from .utils import (print_rank_0, 18 | is_last_rank, 19 | print_rank_last, 20 | is_rank_0, 21 | is_aml) 22 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | from .model_parallel_config import ModelParallelConfig 6 | 7 | # Alias parallel_state as mpu, its legacy name 8 | mpu = parallel_state 9 | 10 | __all__ = [ 11 | "parallel_state", 12 | "tensor_parallel", 13 | "utils", 14 | "ModelParallelConfig" 15 | ] 16 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from typing import Tuple, Optional 5 | 6 | def _bias_dropout_add_func(x, bias, residual, prob, training): 7 | # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor 8 | # NOTE: Previously, the argument `bias` used to be passed as 9 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 10 | # transformer layer but broadcasting should automatically take care of that. 11 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 12 | # seem to be identical performance-wise (both just change the view). 
13 | if bias is not None: 14 | x = x + bias 15 | out = torch.nn.functional.dropout(x, p=prob, training=training) 16 | out = residual + out 17 | return out 18 | 19 | def get_bias_dropout_add(training, fused): 20 | 21 | def unfused_bias_dropout_add(x_with_bias, residual, prob): 22 | x, bias = x_with_bias # unpack 23 | return _bias_dropout_add_func(x, bias, residual, prob, training) 24 | 25 | @torch.jit.script 26 | def bias_dropout_add_fused_train( 27 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 28 | residual: torch.Tensor, 29 | prob: float 30 | ) -> torch.Tensor: 31 | x, bias = x_with_bias # unpack 32 | return _bias_dropout_add_func(x, bias, residual, prob, True) 33 | 34 | @torch.jit.script 35 | def bias_dropout_add_fused_inference( 36 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 37 | residual: torch.Tensor, 38 | prob: float 39 | ) -> torch.Tensor: 40 | x, bias = x_with_bias # unpack 41 | return _bias_dropout_add_func(x, bias, residual, prob, False) 42 | 43 | if fused: 44 | # jit scripting for a nn.module (with dropout) is not 45 | # triggering the fusion kernel. For now, we use two 46 | # different nn.functional routines to account for varying 47 | # dropout semantics during training and inference phases. 48 | if training: 49 | return bias_dropout_add_fused_train 50 | else: 51 | return bias_dropout_add_fused_inference 52 | else: 53 | return unfused_bias_dropout_add 54 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 2 6 | PATCH = 0 7 | PRE_RELEASE = '' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 19 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 20 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 21 | __description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models' 22 | __license__ = 'BSD-3' 23 | __keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 24 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | regex -------------------------------------------------------------------------------- /megatron/core/sequence_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_sequence_parallel_cross_entropy 2 | -------------------------------------------------------------------------------- 
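The constants in `fused_bias_gelu.py` above come from the tanh approximation of GELU (1/sqrt(2*pi) ≈ 0.3989423, sqrt(2/pi) ≈ 0.79788456). The following is a small self-contained numerical check, a sketch rather than repo code, comparing the tanh form against the exact erf-based GELU quoted in the file's comments:

```python
import torch

# Evaluate both GELU forms on a grid; float64 keeps rounding error negligible.
x = torch.linspace(-5.0, 5.0, steps=101, dtype=torch.float64)
tanh_gelu = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
exact_gelu = x * 0.5 * (1.0 + torch.erf(x * 0.70710678))

# The two forms agree to within roughly 1e-3 over this range, which is why
# the fused kernel can use the cheaper tanh approximation.
print((tanh_gelu - exact_gelu).abs().max())
```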
/megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | 4 | from .layers import ( 5 | ColumnParallelLinear, 6 | RowParallelLinear, 7 | VocabParallelEmbedding, 8 | set_tensor_model_parallel_attributes, 9 | set_defaults_if_not_set_tensor_model_parallel_attributes, 10 | copy_tensor_model_parallel_attributes, 11 | param_is_not_tensor_parallel_duplicate, 12 | linear_with_grad_accumulation_and_async_allreduce 13 | 14 | ) 15 | 16 | from .mappings import ( 17 | copy_to_tensor_model_parallel_region, 18 | gather_from_tensor_model_parallel_region, 19 | gather_from_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | scatter_to_sequence_parallel_region, 22 | ) 23 | 24 | from .random import ( 25 | checkpoint, 26 | get_cuda_rng_tracker, 27 | model_parallel_cuda_manual_seed, 28 | model_parallel_reconfigure_tp_seed, 29 | init_checkpointed_activations_memory_buffer, 30 | reset_checkpointed_activations_memory_buffer, 31 | ) 32 | 33 | from .utils import ( 34 | split_tensor_along_last_dim, 35 | split_tensor_into_1d_equal_chunks, 36 | gather_split_1d_tensor, 37 | ) 38 | 39 | __all__ = [ 40 | # cross_entropy.py 41 | "vocab_parallel_cross_entropy", 42 | # data.py 43 | "broadcast_data", 44 | #layers.py 45 | "ColumnParallelLinear", 46 | "RowParallelLinear", 47 | "VocabParallelEmbedding", 48 | "set_tensor_model_parallel_attributes", 49 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 50 | "copy_tensor_model_parallel_attributes", 51 | "param_is_not_tensor_parallel_duplicate", 52 | "linear_with_grad_accumulation_and_async_allreduce", 53 | # mappings.py 54 | "copy_to_tensor_model_parallel_region", 55 | "gather_from_tensor_model_parallel_region", 56 | "gather_from_sequence_parallel_region", 57 | # "reduce_from_tensor_model_parallel_region", 58 | "scatter_to_tensor_model_parallel_region", 59 | "scatter_to_sequence_parallel_region", 60 | # random.py 61 | "checkpoint", 62 | "get_cuda_rng_tracker", 63 | "model_parallel_cuda_manual_seed", 64 | "init_checkpointed_activations_memory_buffer", 65 | "reset_checkpointed_activations_memory_buffer", 66 | # utils.py 67 | "split_tensor_along_last_dim", 68 | "split_tensor_into_1d_equal_chunks", 69 | "gather_split_1d_tensor", 70 | ] 71 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/weight_grad_store.py: -------------------------------------------------------------------------------- 1 | import queue 2 | 3 | class WeightGradStore: 4 | 5 | cache = [] 6 | weight_grad_queue = queue.Queue() 7 | combine_bw = True 8 | 9 | @classmethod 10 | def set_combine_bw(cls, combine_bw): 11 | # For the following backward pass, combine W with B and skip next W. 12 | cls.combine_bw = combine_bw 13 | 14 | @classmethod 15 | def put(cls, total_input, grad_output, weight, func): 16 | if cls.combine_bw == True: 17 | func(total_input, grad_output, weight) 18 | return 19 | # Store the weight gradient computation of linear layers. 20 | cls.cache.append((total_input, grad_output, weight, func)) 21 | 22 | @classmethod 23 | def flush(cls): 24 | # Collect all stored computations during backward as a W. 25 | cls.weight_grad_queue.put(cls.cache) 26 | cls.cache = [] 27 | 28 | @classmethod 29 | def pop(cls): 30 | # Execute a single W. 
31 | assert cls.weight_grad_queue.qsize() > 0 32 | stored_grads = cls.weight_grad_queue.get() 33 | for total_input, grad_output, weight, func in stored_grads: 34 | func(total_input, grad_output, weight) -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .transformer_config import TransformerConfig 4 | from .core_attention import CoreAttention 5 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | -------------------------------------------------------------------------------- /megatron/core/transformer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for transformer layers.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args 8 | 9 | from deepspeed.runtime.zero import GatheredParameters 10 | 11 | def attention_mask_func(attention_scores, attention_mask): 12 | attention_scores.masked_fill_(attention_mask, -10000.0) 13 | return attention_scores 14 | 15 | 16 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False): 17 | """Simple linear layer with weight initialization.""" 18 | layer = torch.nn.Linear(rows, columns) 19 | if get_args().perform_initialization: 20 | with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init): 21 | init_method(layer.weight) 22 | with torch.no_grad(): 23 | with GatheredParameters(layer.bias, modifier_rank=0, enable=gather_params_on_init): 24 | layer.bias.zero_() 25 | return layer 26 | 27 | 28 | @torch.jit.script 29 | def gelu_impl(x): 30 | """OpenAI's gelu implementation.""" 31 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 32 | 33 | 34 | def openai_gelu(x): 35 | return gelu_impl(x) 36 | 37 | 38 | # This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter 39 | @torch.jit.script 40 | def erf_gelu(x): 41 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) 42 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py:
-------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes.
*/ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | #include <cuda_fp16.h> 4 | #include <torch/extension.h> 5 | #include <vector> 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd( 21 | torch::Tensor const& input, 22 | float scale_factor) { 23 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 24 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 25 | (input.scalar_type() == at::ScalarType::BFloat16), 26 | "Only fp16 and bf16 are supported"); 27 | 28 | return fwd_cuda(input, scale_factor); 29 | } 30 | 31 | torch::Tensor bwd( 32 | torch::Tensor const& output_grads, 33 | torch::Tensor const& softmax_results, 34 | float scale_factor) { 35 | 36 | AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); 37 | AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); 38 | 39 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 40 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 41 | "Only fp16 and bf16 are supported"); 42 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 43 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 44 | "Only fp16 and bf16 are supported"); 45 | 46 | return bwd_cuda(output_grads, softmax_results, scale_factor); 47 | } 48 | 49 | } // end namespace scaled_softmax 50 | } // end namespace fused_softmax 51 | } // end namespace multihead_attn 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("forward", 55 | &multihead_attn::fused_softmax::scaled_softmax::fwd, 56 | "Self Multihead Attention scaled, softmax -- Forward."); 57 | m.def("backward", 58 | &multihead_attn::fused_softmax::scaled_softmax::bwd, 59 | "Self Multihead Attention scaled, softmax -- Backward."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*/ 2 | 3 | #include <cuda_fp16.h> 4 | #include <torch/extension.h> 5 | #include <vector> 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_upper_triang_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 21 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 22 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 23 | (input.scalar_type() == at::ScalarType::BFloat16), 24 | "Only fp16 and bf16 are supported"); 25 | 26 | return fwd_cuda(input, scale_factor); 27 | } 28 | 29 | torch::Tensor bwd( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor) { 33 | 34 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 35 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 36 | 37 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 38 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 39 | "Only fp16 and bf16 are supported"); 40 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 41 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 42 | "Only fp16 and bf16 are supported"); 43 | 44 | return bwd_cuda(output_grads, softmax_results, scale_factor); 45 | } 46 | 47 | } // end namespace scaled_upper_triang_masked_softmax 48 | } // end namespace fused_softmax 49 | } // end namespace multihead_attn 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", 53 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 54 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 55 | m.def("backward", 56 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 57 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 58 | } 59 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import torch 4 | from deepspeed.accelerator.real_accelerator import get_accelerator 5 | 6 | if get_accelerator().device_name() == 'xpu': 7 | import intel_extension_for_pytorch 8 | if get_accelerator().device_name() == 'cuda': 9 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 10 | from apex.normalization import MixedFusedRMSNorm as RMSNorm 11 | else: 12 | if hasattr(torch.xpu, "IpexRmsNorm"): 13 | from .fused_rmsnorm import RMSNorm 14 | else: 15 | from .rmsnorm import RMSNorm 16 | from torch.nn import LayerNorm 17 | 18 | from .distributed import DistributedDataParallel 19 | from .bert_model import BertModel 20 | from .gpt_model import GPTModel, GPTModelPipe 21 | from .t5_model import T5Model 22 | from .language_model import get_language_model 23 | from .module import Float16Module 24 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/model/fused_rmsnorm.py: -------------------------------------------------------------------------------- 1 | from megatron import get_args 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | from torch.nn import init 6 | import intel_extension_for_pytorch as ipex # noqa 7 | 8 | # Taken from facebookresearch/llama 9 | class RMSNorm(torch.nn.Module): 10 | def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): 11 | super().__init__() 12 | self.eps = eps 13 | self.weight = Parameter(torch.ones(dim, 14 | dtype=get_args().params_dtype)) 15 | self.sequence_parallel = sequence_parallel 16 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 17 | 18 | def forward(self, x): 19 | output = torch.xpu.IpexRmsNorm(x, self.weight.shape, self.weight, self.eps) 20 | return output 21 | -------------------------------------------------------------------------------- /megatron/model/rmsnorm.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
2 | 3 | from deepspeed.accelerator import get_accelerator 4 | from megatron import get_args 5 | 6 | import torch 7 | from torch.nn import init 8 | from torch.nn.parameter import Parameter 9 | 10 | # Taken from facebookresearch/llama 11 | class RMSNorm(torch.nn.Module): 12 | def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): 13 | super().__init__() 14 | self.eps = eps 15 | init_device = None 16 | if get_accelerator().device_name() == 'hpu': 17 | init_device = get_accelerator().current_device_name() 18 | self.weight = Parameter(torch.empty(dim, 19 | device=init_device, 20 | dtype=get_args().params_dtype)) 21 | init.ones_(self.weight) 22 | self.sequence_parallel = sequence_parallel 23 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 24 | 25 | def _norm(self, x): 26 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 27 | 28 | def forward(self, x): 29 | output = self._norm(x.float()).type_as(x) 30 | return output * self.weight 31 | -------------------------------------------------------------------------------- /megatron/model/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/model/vision/__init__.py -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would be more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | from deepspeed.accelerator import get_accelerator 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducibility.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % get_accelerator().device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | get_accelerator().set_device(device) 50 | 51 | # Call the init process. 52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/profiler.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
2 | 3 | import torch 4 | 5 | on_step_begin = [] 6 | on_step_end = [] 7 | 8 | def trigger(phase): 9 | for f in phase: f() 10 | 11 | def setup_profiler(args, device): 12 | if args.profile is None: 13 | return 14 | 15 | start_step, end_step = map(int, args.profile_steps.split(',')) 16 | active_steps = end_step - start_step + 1 17 | cur_step = 0 18 | 19 | def on_step_begin_fn(): 20 | nonlocal cur_step 21 | cur_step = cur_step + 1 22 | on_step_begin.append(on_step_begin_fn) 23 | 24 | def when(cond, clbk): 25 | def fn(): 26 | if cond(): 27 | clbk() 28 | return fn 29 | 30 | def is_start_step(): 31 | return cur_step == start_step 32 | 33 | def is_end_step(): 34 | return cur_step == end_step 35 | 36 | def is_capture_step(): 37 | return cur_step >= start_step and cur_step <= end_step 38 | 39 | if args.profile.startswith('pt') and ( 40 | args.profile_ranks is None or torch.distributed.get_rank() in args.profile_ranks 41 | ): 42 | schedule = torch.profiler.schedule(wait=0, warmup=0, active=active_steps, repeat=1) 43 | activities = [torch.profiler.ProfilerActivity.CPU] 44 | activities.extend([torch.profiler.ProfilerActivity.HPU] if device.startswith("hpu") else []) 45 | activities.extend([torch.profiler.ProfilerActivity.CUDA] if device.startswith("cuda") else []) 46 | full = args.profile == 'pt-full' 47 | 48 | profiler = torch.profiler.profile( 49 | schedule=schedule, 50 | activities=activities, 51 | on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir, use_gzip=True), 52 | with_stack=full) 53 | 54 | on_step_begin.append(when(is_start_step, profiler.start)) 55 | on_step_end.append(when(is_capture_step, profiler.step)) 56 | on_step_end.append(when(is_end_step, profiler.stop)) 57 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified tasks in the evaluation harness. 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 4 | # under the license https://huggingface.co/spaces/bigscience/license 5 | 6 | # this script converts results.json: 7 | # 8 | # "results": { 9 | # "arc_challenge": { 10 | # "acc": 0.24232081911262798, 11 | # "acc_stderr": 0.01252159329580012, 12 | # "acc_norm": 0.2764505119453925, 13 | # "acc_norm_stderr": 0.013069662474252425 14 | # }, 15 | # 16 | # into a format expected by a spreadsheet, which is: 17 | # 18 | # task metric value err 19 | # arc_challenge acc xxx yyy 20 | # arc_challenge acc_norm xxx yyy 21 | # arc_challenge f1 xxx yyy 22 | # 23 | # usage: 24 | # report-to-csv.py results.json 25 | 26 | 27 | import sys 28 | import json 29 | import io 30 | import csv 31 | 32 | results_file = sys.argv[1] 33 | 34 | csv_file = results_file.replace("json", "csv") 35 | 36 | print(f"Converting {results_file} to {csv_file}") 37 | 38 | with io.open(results_file, 'r', encoding='utf-8') as f: 39 | results = json.load(f) 40 | 41 | with io.open(csv_file, 'w', encoding='utf-8') as f: 42 | 43 | writer = csv.writer(f) 44 | writer.writerow(["task", "metric", "value", "err", "version"]) 45 | 46 | versions = results["versions"] 47 | 48 | for k,v in sorted(results["results"].items()): 49 | if k not in versions: 50 | versions[k] = -1 51 | 52 | if "acc" in v: 53 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 54 | if "acc_norm" in v: 55 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 56 | if "f1" in v: 57 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 58 | # if "ppl" in v: 59 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 60 | # if "em" in v: 61 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 62 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 
21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. 
Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? 
") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import pytest 4 | 5 | from megatron.core import parallel_state 6 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | 10 | # initialize model parallel for tests 11 | parallel_state.set_tensor_model_parallel_world_size(1) 12 | parallel_state.set_tensor_model_parallel_rank(0) 13 | parallel_state._set_global_memory_buffer() 14 | parallel_state.set_pipeline_model_parallel_rank(0) 15 | parallel_state.set_pipeline_model_parallel_world_size(1) 16 | 17 | model_parallel_cuda_manual_seed(123) 18 | 19 | 20 | @pytest.fixture 21 | def transformer_config(): 22 | return TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 23 | -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/check_slurm_job_completion.py: -------------------------------------------------------------------------------- 1 | """Check if a given slurm job id completed successfully 2 | Usage: 3 | python3 check_slurm_job_completion.py 4 | """ 5 | 6 | 
import sys 7 | import subprocess 8 | 9 | 10 | cmd = f"sacct -j {sys.argv[1]}" 11 | result = subprocess.check_output(cmd, shell=True).decode().split() 12 | assert len(result) > 20, "JOB state not available." 13 | 14 | status = result[19] 15 | exit_code = result[20] 16 | 17 | assert status == "COMPLETED", f"Job {sys.argv[1]} not completed." 18 | assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully." 19 | 20 | -------------------------------------------------------------------------------- /tests/functional_tests/shell_test_utils/jobwait.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | JOBID=$1 4 | echo "Job id : $JOBID" 5 | 6 | if [[ -z "$JOBID" ]]; then 7 | exit 1 8 | fi 9 | 10 | sleep 10s 11 | 12 | while true; do 13 | export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1` 14 | case "${STATE}" in 15 | PENDING|RUNNING|REQUEUED) 16 | echo "Job is still in $STATE" 17 | sleep 15s 18 | ;; 19 | *) 20 | sleep 30s 21 | echo "Exiting with SLURM job status '${STATE}'" 22 | exit 0 23 | ;; 24 | esac 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 
1.1249785294117647} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -o xtrace 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | TP_SIZE=$4 8 | PP_SIZE=$5 9 | NNODES=$6 10 | MAX_STEPS=$7 11 | VP_SIZE=$8 12 | GPUS_PER_NODE=8 13 | # Change for multinode config 14 | MASTER_ADDR=localhost 15 | MASTER_PORT=6000 16 | NODE_RANK=0 17 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Runs the "345M" parameter model 22 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 23 | 24 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 25 | pretrain_bert.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --log-params-norm \ 30 | --log-num-zeros-in-grad \ 31 | --log-validation-ppl-to-tensorboard \ 32 | --log-timers-to-tensorboard \ 33 | --tensorboard-dir ${TENSORBOARD_DIR} \ 34 | --micro-batch-size 4 \ 35 | --global-batch-size 128 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --train-iters $MAX_STEPS \ 39 | --timing-log-level 2 \ 40 | --lr-decay-iters 990000 \ 41 | --save $CHECKPOINT_PATH \ 42 | --load $CHECKPOINT_PATH \ 43 | --data-path $DATA_PATH \ 44 | --vocab-file /workspace/data/bert_data/vocab.txt \ 45 | --data-impl mmap \ 46 | --split 949,50,1 \ 47 | --distributed-backend nccl \ 48 | --lr 0.0001 \ 49 | --min-lr 0.00001 \ 50 | --lr-warmup-fraction 0.01 \ 51 | --log-interval 1 \ 52 | --save-interval 10000 \ 53 | --eval-interval 1000 \ 54 | --eval-iters 10 \ 55 | --tensor-model-parallel-size $TP_SIZE \ 56 | --pipeline-model-parallel-size $PP_SIZE \ 57 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 58 | --no-gradient-accumulation-fusion \ 59 | --fp16 -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out 
--container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | IMAGE=gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel 13 | 14 | if [[ $USE_TE -eq 1 ]]; then 15 | echo "Using container nvcr.io/nvidia/pytorch:23.04-py3 for running with TE ..." 
16 | IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 17 | fi 18 | 19 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 20 | ls 21 | cd /workspace/megatron-lm 22 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS" 23 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/tensor_parallel/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/tensor_parallel/__int__.py -------------------------------------------------------------------------------- /tests/test_megatron.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import re 4 | import subprocess 5 | 6 | 7 | @pytest.fixture(params=[1]) 8 | def moe_num_experts(request): 9 | return str(request.param) 10 | 11 | 12 | @pytest.fixture(params=[1]) 13 | def mp_size(request): 14 | return str(request.param) 15 | 16 | 17 | @pytest.fixture 18 | def params(moe_num_experts, mp_size): 19 | base_dir = os.getenv("MEGATRON_CKPT_DIR") 20 | assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" 21 | 22 | vocab_file = os.path.join(base_dir, "gpt2-vocab.json") 23 | merge_file = os.path.join(base_dir, "gpt2-merges.txt") 24 | ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") 25 | 26 | return [ 27 | "--micro-batch-size", "1", 28 | "--num-layers", "24", 29 | "--hidden-size", "1024", 30 | "--num-attention-heads", "16", 31 | "--max-position-embeddings", "1024", 32 | "--vocab-file", vocab_file, 33 | "--merge-file", merge_file, 34 | "--load", ckpt_path, 35 | "--seq-length", "1024", 36 | "--out-seq-length", "1024", 37 | "--tensor-model-parallel-size", mp_size, 38 | "--tokenizer-type", "GPT2BPETokenizer", 39 | "--num-experts", moe_num_experts, 40 | "--mlp-type", "standard", 41 | "--num-samples", "0", 42 | "--fp16", 43 | ] 44 | 45 | 46 | def test_moe_megatron(params, mp_size): 47 | output_re = r"===START OUTPUT===([\S\s]*)===END OUTPUT===" 48 | 49 | # Run the baseline 50 | baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params 51 | result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) 52 | baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 53 | 54 | # Run with DeepSpeed 55 | deepspeed_cmd = baseline_cmd + ["--ds-inference"] 56 | result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) 57 | deepspeed_output 
= re.search(output_re, result.stdout.decode("utf-8")).group(1) 58 | 59 | assert ( 60 | baseline_output == deepspeed_output 61 | ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" 62 | -------------------------------------------------------------------------------- /tests/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/transformer/__init__.py -------------------------------------------------------------------------------- /tests/transformer/test_core_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.core_attention import CoreAttention 9 | 10 | 11 | @pytest.fixture 12 | def core_attention(transformer_config): 13 | return CoreAttention(transformer_config) 14 | 15 | 16 | class TestCoreAttention: 17 | def test_constructor(self, core_attention): 18 | assert isinstance(core_attention, CoreAttention) 19 | assert core_attention.layer_number == 1 20 | 21 | num_weights = sum([p.numel() for p in core_attention.parameters()]) 22 | assert num_weights == 0 23 | 24 | def test_cpu_forward(self, core_attention): 25 | # we can't currently do this because the global memory buffer is on GPU 26 | pass 27 | 28 | def test_gpu_forward(self, core_attention): 29 | 30 | # destroy_global_memory_buffer() 31 | # _set_global_memory_buffer() 32 | # model_parallel_cuda_manual_seed(123) 33 | 34 | core_attention.cuda() 35 | config = core_attention.config 36 | sequence_length = 32 37 | micro_batch_size = 2 38 | # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] 39 | query_layer = torch.ones( 40 | ( 41 | sequence_length, 42 | micro_batch_size, 43 | config.num_attention_heads, 44 | config.hidden_size // config.num_attention_heads, 45 | ) 46 | ).cuda() 47 | 48 | key_layer = torch.ones_like(query_layer).cuda() 49 | 50 | value_layer = torch.ones_like(query_layer).cuda() 51 | 52 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 53 | 54 | context_layer = core_attention( 55 | query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask 56 | ) 57 | 58 | assert context_layer.shape[0] == sequence_length 59 | assert context_layer.shape[1] == micro_batch_size 60 | assert context_layer.shape[2] == config.hidden_size 61 | assert context_layer.device.type == 'cuda' 62 | assert context_layer.dtype == torch.float32 63 | 64 | -------------------------------------------------------------------------------- /tests/transformer/test_parallel_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
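# NOTE (illustrative, not part of the original file): in test_core_attention.py
# above, CoreAttention consumes query/key/value tensors of shape
# [sequence_length, micro_batch_size, num_attention_heads,
# hidden_size // num_attention_heads] and returns a context tensor of shape
# [sequence_length, micro_batch_size, hidden_size], which is exactly what the
# gpu-forward assertions check.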
3 | 4 | import pytest 5 | 6 | import torch 7 | import types 8 | 9 | from megatron.core.transformer.parallel_mlp import ParallelMLP 10 | from megatron.global_vars import set_args 11 | 12 | from deepspeed.accelerator import get_accelerator 13 | device_name = get_accelerator().device_name() 14 | 15 | @pytest.fixture 16 | def mlp(transformer_config): 17 | mlp_args = types.SimpleNamespace( 18 | swiglu=False, 19 | openai_gelu=True, 20 | onnx_safe=False, 21 | bias_gelu_fusion=False, 22 | transformer_impl="", 23 | cache_fp8_weight=False, 24 | fp8_interval=False, 25 | cache_fp8_weight_fwd=False 26 | ) 27 | set_args(mlp_args) 28 | return ParallelMLP(transformer_config) 29 | 30 | 31 | class TestParallelMLP: 32 | def test_constructor(self, mlp): 33 | assert isinstance(mlp, ParallelMLP) 34 | 35 | num_weights = sum([p.numel() for p in mlp.parameters()]) 36 | assert num_weights == 1212 37 | 38 | def test_cpu_forward(self, mlp, transformer_config): 39 | # [sequence length, micro batch size, hidden size] 40 | hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) 41 | output, output_bias = mlp(hidden_states) 42 | assert output.shape[0] == 32 43 | assert output.shape[1] == 2 44 | assert output.shape[2] == transformer_config.hidden_size 45 | assert output_bias == None 46 | assert output.dtype == torch.float32 47 | 48 | @pytest.mark.skipif(not get_accelerator().is_available(), reason="accelerator not available") 49 | def test_accelerator_forward(self, mlp, transformer_config): 50 | mlp.to(device_name) 51 | # [sequence length, batch size, hidden size] 52 | hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) 53 | hidden_states = hidden_states.to(device_name) 54 | output, output_bias = mlp(hidden_states) 55 | assert output.shape[0] == 32 56 | assert output.shape[1] == 2 57 | assert output.shape[2] == transformer_config.hidden_size 58 | assert output_bias == None 59 | assert output.dtype == torch.float32 60 | assert output.device.type == device_name 61 | 62 | -------------------------------------------------------------------------------- /tests/transformer/test_parallel_transformer_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
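# NOTE (illustrative, not part of the original file): the transformer_config
# fixture used below is defined in tests/conftest.py earlier in this listing
# (num_layers=2, hidden_size=12, num_attention_heads=4,
# use_cpu_initialization=True).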
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | from megatron.core.transformer.parallel_transformer_layer import ParallelTransformerLayer 10 | 11 | 12 | @pytest.fixture 13 | def parallel_transformer_layer(transformer_config): 14 | return ParallelTransformerLayer(transformer_config) 15 | 16 | 17 | class TestParallelTransformerLayer: 18 | def test_constructor(self, parallel_transformer_layer): 19 | assert isinstance(parallel_transformer_layer, ParallelTransformerLayer) 20 | assert parallel_transformer_layer.layer_number == 1 21 | 22 | num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) 23 | assert num_weights == 1884 24 | 25 | def test_gpu_forward(self, parallel_transformer_layer): 26 | config: TransformerConfig = parallel_transformer_layer.config 27 | sequence_length = 32 28 | micro_batch_size = 2 29 | parallel_transformer_layer.cuda() 30 | 31 | # [sequence length, batch size, hidden size] 32 | hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) 33 | hidden_states = hidden_states.cuda() 34 | 35 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 36 | 37 | hidden_states = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) 38 | assert hidden_states.shape[0] == sequence_length 39 | assert hidden_states.shape[1] == micro_batch_size 40 | assert hidden_states.shape[2] == config.hidden_size 41 | -------------------------------------------------------------------------------- /tests/transformer/test_transformer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
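# NOTE (illustrative, not part of the original file): the expected values below
# follow from the conftest fixture (hidden_size=12, num_attention_heads=4):
# ffn_hidden_size defaults to 4 * hidden_size = 48, and kv_channels defaults to
# hidden_size // num_attention_heads = 12 // 4 = 3.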
2 | 3 | 4 | class TestTransformerConfig: 5 | def test_transformer_config(self, transformer_config): 6 | 7 | assert transformer_config.hidden_size == 12 8 | assert transformer_config.ffn_hidden_size == 48 9 | assert transformer_config.num_attention_heads == 4 10 | assert transformer_config.kv_channels == 3 11 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER 4 | from megatron.core.tensor_parallel.random import checkpoint 5 | from tests.unit_tests.test_utilities import Utils 6 | import pytest 7 | import torch 8 | 9 | def test_cuda_rng_states_tracker(): 10 | rng_tracker = CudaRNGStatesTracker() 11 | rng_tracker.set_states({"state1":1234}) 12 | assert(rng_tracker.get_states()["state1"] == 1234) 13 | rng_tracker.reset() 14 | assert(rng_tracker.get_states() == {}) 15 | seed = 1111 16 | rng_tracker.add("state2",seed) 17 | with 
pytest.raises(Exception): 18 | assert(rng_tracker.add("state3",seed)) 19 | with pytest.raises(Exception): 20 | assert(rng_tracker.add("state2",111)) 21 | assert(rng_tracker.get_states()['state2'] is not None) 22 | with pytest.raises(Exception): 23 | assert() 24 | 25 | rng_tracker.fork("state2") 26 | torch.cuda.manual_seed(seed) 27 | rng_state = torch.cuda.get_rng_state() 28 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 29 | 30 | def test_model_parallel_cuda_manual_seed(): 31 | Utils.initialize_model_parallel(4,2) 32 | model_parallel_cuda_manual_seed(0) 33 | assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import megatron.core.tensor_parallel.utils as util 3 | import megatron.core.parallel_state as ps 4 | from tests.unit_tests.test_utilities import Utils 5 | 6 | rank = Utils.rank 7 | 8 | def test_split_tensor_along_last_dim(): 9 | input_tensor = torch.rand((3,4)) 10 | torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) 11 | torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) 12 | 13 | def test_split_tensor_into_1d_equal_chunks(): 14 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 15 | input_tensor = torch.rand((3,4)) 16 | output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) 17 | if rank % 2 == 0 : 18 | start = 0 19 | end = int(input_tensor.numel()/2) 20 | else : 21 | start = int(input_tensor.numel()/2) 22 | end = input_tensor.numel() 23 | 24 | assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) 25 | Utils.destroy_model_parallel() 26 | 27 | def test_gather_split_1d_tensor(): 28 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 29 | input_tensor = torch.ones((2,4)).cuda() * rank 30 | actual_output_tensor = util.gather_split_1d_tensor(input_tensor) 31 | if rank %2 == 0: 32 | expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) 33 | else : 34 | expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) 35 | assert(torch.equal(actual_output_tensor, expected_output_tensor)) 36 | Utils.destroy_model_parallel() 37 | 38 | def test_vocab(): 39 | global_vocab_size = 1600 40 | per_partition_vocab_size = 1600 / Utils.world_size 41 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) 42 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) 43 | 
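# NOTE (illustrative, not part of the original file): a concrete instance of
# the partitioning checked above: with global_vocab_size=1600 and world_size=8,
# per_partition_vocab_size is 200, so rank r owns the half-open id range
# [200 * r, 200 * (r + 1)); e.g. rank 3 covers [600, 800).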
-------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 2 | 3 | import os 4 | import torch 5 | import megatron.core.parallel_state as ps 6 | 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | class Utils: 10 | 11 | world_size = int(os.getenv("WORLD_SIZE", '1')) 12 | rank = int(os.getenv('LOCAL_RANK', '0')) 13 | 14 | @staticmethod 15 | def initialize_distributed(): 16 | print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') 17 | get_accelerator().set_device(Utils.rank % get_accelerator().device_count()) 18 | init_method = 'tcp://' 19 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 20 | master_port = os.getenv('MASTER_PORT', '6000') 21 | init_method += master_ip + ':' + master_port 22 | torch.distributed.init_process_group(backend=get_accelerator().communication_backend_name(), world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) 23 | 24 | @staticmethod 25 | def destroy_model_parallel(): 26 | ps.destroy_model_parallel() 27 | torch.distributed.barrier() 28 | 29 | @staticmethod 30 | def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, sequence_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): 31 | ps.destroy_model_parallel() 32 | if not torch.distributed.is_initialized(): 33 | Utils.initialize_distributed() 34 | ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, sequence_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import megatron.core.utils as util 4 | import numpy as np 5 | 6 | def test_divide_properly(): 7 | assert util.divide(4,2) == 2 8 | 9 | def test_divide_improperly(): 10 | with pytest.raises(AssertionError): 11 | util.divide(4,5) 12 | 13 | def test_global_memory_buffer(): 14 | global_memory_buffer = util.GlobalMemoryBuffer() 15 | obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") 16 | expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) 17 | assert torch.equal(obtained_tensor, expected_tensor) 18 | 19 | def test_make_viewless_tensor(): 20 | inp = torch.rand((3,4)) 21 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) 22 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) 23 | 24 | def test_safely_set_viewless_tensor_data(): 25 | tensor = torch.zeros((3,4)) 26 | new_data_tensor = torch.tensor(np.random.rand(3,4)) 27 | util.safely_set_viewless_tensor_data(tensor, new_data_tensor) 28 | assert(torch.equal(tensor, new_data_tensor)) 29 | 30 | def test_assert_viewless_tensor(): 31 | tensor = torch.rand((3,4)) 32 | assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) 33 | input_tensor_list=[tensor,tensor,tensor] 34 | output_tensor_list = 
util.assert_viewless_tensor(input_tensor_list) 35 | for inp,out in zip(input_tensor_list, output_tensor_list): 36 | assert(torch.equal(inp,out)) 37 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tools/__init__.py -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} <checkpoint_file>') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file) 34 | dump_data(sd) 35 | 36 | quit() 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 
'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command, shell=True) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
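# NOTE (illustrative, not part of the original file): this utility concatenates
# every *.json file found under --json_path into --output_file; each input row
# is parsed with json.loads() purely as a validity check before being written
# through verbatim. Example invocation (paths are placeholders):
#   python tools/openwebtext/merge_jsons.py --json_path ./shards --output_file merged.json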
2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/retro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tools/retro/__init__.py -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
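# NOTE (illustrative, not part of the original file):
# tools/openwebtext/remove_group_duplicates.py above takes three positional
# arguments (names are placeholders):
#   python remove_group_duplicates.py <url_file> <data_file> <output_file>
# <url_file> lists groups of duplicate URLs; every document whose 'url' appears
# after the first entry of its group is dropped from <data_file>.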
2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/examples/get_dataset_configs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Small English Wikipedia dataset (~2M chunks). 4 | get_wiki_tiny_config() { 5 | RETRO_INDEX_STR="IVF4096_HNSW4,Flat" 6 | RETRO_NCHUNKS_SAMPLED=2281307 7 | RETRO_GPT_TRAIN_SAMPLES=31250 8 | LR_DECAY_SAMPLES=2 9 | LR_WARMUP_SAMPLES=1 10 | RETRO_GPT_EVAL_INTERVAL=2000 11 | RETRO_GPT_EVAL_ITERS=100 12 | RETRO_EF_SEARCH=4 13 | RETRO_NPROBE=64 14 | DATALOADER_TYPE=cyclic 15 | } 16 | 17 | # English Wikipedia dataset (~67M chunks). 18 | get_wiki_config() { 19 | RETRO_INDEX_STR="IVF262144_HNSW32,Flat" 20 | RETRO_NCHUNKS_SAMPLED=66625331 21 | RETRO_GPT_TRAIN_SAMPLES=2037248 22 | LR_DECAY_SAMPLES=2 23 | LR_WARMUP_SAMPLES=1 24 | RETRO_GPT_EVAL_INTERVAL=2000 25 | RETRO_GPT_EVAL_ITERS=100 26 | RETRO_EF_SEARCH=16 27 | RETRO_NPROBE=4096 28 | DATALOADER_TYPE=cyclic 29 | } 30 | 31 | # Full corpus (~5B chunks). 32 | get_corpus_config() { 33 | RETRO_INDEX_STR="OPQ64_128,IVF4194304_HNSW32,PQ64" 34 | RETRO_NCHUNKS_SAMPLED=300000000 35 | RETRO_GPT_TRAIN_SAMPLES=192000000 36 | LR_DECAY_SAMPLES=166400000 37 | LR_WARMUP_SAMPLES=162761 38 | RETRO_GPT_EVAL_INTERVAL=2000 39 | RETRO_GPT_EVAL_ITERS=50 40 | RETRO_EF_SEARCH=32 41 | RETRO_NPROBE=4096 42 | DATALOADER_TYPE=single 43 | } 44 | -------------------------------------------------------------------------------- /tools/retro/examples/preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | unset NCCL_DEBUG 5 | 6 | NPROCS=8 # NPROCS must be <= number of GPUs. 7 | 8 | set_current_dir() { 9 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 10 | } 11 | 12 | ################ Dataset configs. ################ 13 | # This script contains methods to customize arguments to specific dataset 14 | # types. Customize this script as needed for your datasets. 15 | set_current_dir 16 | . $DIR/get_dataset_configs.sh 17 | 18 | ################ Environment variables. ################ 19 | # *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for 20 | # a description of the required environment variables. These variables can be 21 | # set however a user would like. In our setup, we use another bash script 22 | # (location defined by $RETRO_ENV_VARS) that sets all the environment variables 23 | # at once. 24 | . $RETRO_ENV_VARS 25 | 26 | ######## Environment vars. ######## 27 | set_current_dir 28 | . ${DIR}/get_preprocess_cmd.sh 29 | 30 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 31 | echo "DIR = '$DIR'." 32 | echo "RETRO_PREPROCESS_CMD = '$RETRO_PREPROCESS_CMD'." 33 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 34 | 35 | ######## Command. 
######## 36 | FULL_CMD="\ 37 | pwd && cd ${REPO_DIR} && pwd && \ 38 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 39 | python -m torch.distributed.run \ 40 | --nproc_per_node ${NPROCS} \ 41 | --nnodes 1 \ 42 | --node_rank ${NODE_RANK} \ 43 | --master_addr ${MASTER_ADDR} \ 44 | --master_port 6000 \ 45 | $RETRO_PREPROCESS_CMD \ 46 | " 47 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 48 | echo "FULL_CMD = '$FULL_CMD'." 49 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 50 | eval $FULL_CMD 51 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 8 | 9 | Index type generally read from argument '--retro-index-ty'. 10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 
--------------------------------------------------------------------------------
/tools/retro/index/index.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import abc
import numpy as np
import os
import torch

from megatron import get_retro_args
from tools.retro.external_libs import faiss

from .utils import get_index_dir


class Index(abc.ABC):
    '''Abstract base class for indexes.

    *Note*: While currently only Faiss-based classes are implemented, in the
    future, this class will be extended with other types of indexes that have
    different performance-accuracy trade-offs.

    The primary methods to override are:
    - train() : Train index on the sampled training chunks.
    - add() : Add all training chunks to index.
    '''

    @classmethod
    def c_verbose(cls, index, v):
        '''Make index object verbose.'''
        assert isinstance(v, bool)
        faiss.ParameterSpace().set_index_parameter(index, "verbose", v)

    def get_empty_index_path(self):
        args = get_retro_args()
        return os.path.join(
            get_index_dir(),
            "empty_%.3f.faissindex" % args.retro_index_train_load_fraction,
        )

    def get_empty_index(self):
        return faiss.read_index(self.get_empty_index_path())

    def get_added_index_path(self):
        args = get_retro_args()
        return os.path.join(
            get_index_dir(),
            "added_%.3f_%.3f.faissindex" % (
                args.retro_index_train_load_fraction,
                args.retro_index_add_load_fraction,
            ),
        )

    def get_added_index(self):
        return faiss.read_index(self.get_added_index_path())

    @abc.abstractmethod
    def train(self, *args):
        pass

    @abc.abstractmethod
    def add(self, *args):
        pass

    def embed_text_dataset_block(self, embedder, text_dataset, _range):
        '''Embed a range of a text dataset.'''
        sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range))
        return embedder.embed_text_dataset(sub_dataset)
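

# Minimal subclass sketch (illustrative only, not from the original file): a
# concrete index implements train() and add(); the path naming and the
# reading back of the Faiss index are inherited from Index.
#
#   class FlatIndex(Index):
#       def train(self, *args):
#           pass  # a flat index needs no training
#       def add(self, *args):
#           index = self.get_empty_index()
#           # ...add embedded chunks, then write to get_added_index_path()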
--------------------------------------------------------------------------------
/tools/retro/index/indexes/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .faiss_base import FaissBaseIndex
from .faiss_par_add import FaissParallelAddIndex
--------------------------------------------------------------------------------
/tools/retro/index/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import concurrent
import gc
import glob
import numpy as np
import os
import psutil
import time
import torch
from tqdm import tqdm

from megatron import get_retro_args, print_rank_0
from tools.retro.db.utils import get_indexed_dataset_infos
from tools.retro.external_libs import h5py


def get_index_dir():
    """Create (if needed) and return the sub-directory for this index."""

    args = get_retro_args()

    # Directory path.
    index_dir_path = os.path.join(
        args.retro_workdir,
        "index",
        args.retro_index_type,
        args.retro_index_str,
    )

    # Make directory.
    os.makedirs(index_dir_path, exist_ok=True)

    return index_dir_path


def num_samples_to_block_ranges(num_samples):
    '''Split a range (length num_samples) into a sequence of block ranges
    of size block_size.'''
    args = get_retro_args()
    block_size = args.retro_block_size
    start_idxs = list(range(0, num_samples, block_size))
    end_idxs = [min(num_samples, s + block_size) for s in start_idxs]
    ranges = list(zip(start_idxs, end_idxs))
    return ranges


def get_training_data_root_dir():
    args = get_retro_args()
    return os.path.join(args.retro_workdir, "index", "train_emb")


def get_training_data_block_dir():
    return os.path.join(get_training_data_root_dir(), "blocks")


def get_training_data_block_paths():
    return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5"))


def get_training_data_merged_path():
    args = get_retro_args()
    return os.path.join(get_training_data_root_dir(),
                        "train_%.3f.bin" % args.retro_index_train_load_fraction)


def get_added_codes_dir():
    return os.path.join(get_index_dir(), "add_codes")


def get_added_code_paths():
    return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5"))
--------------------------------------------------------------------------------
/tools/retro/query/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .query import query_pretraining_neighbors
--------------------------------------------------------------------------------
/tools/retro/query/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import hashlib
import os

from megatron import get_retro_args


def get_query_workdir():
    args = get_retro_args()
    return os.path.join(args.retro_workdir, "query")


def get_neighbor_dirname(key, dataset):
    hashes = ",".join([d.desc_hash for d in dataset.datasets])
    # 'hash_str' avoids shadowing the built-in hash().
    hash_str = hashlib.md5(hashes.encode()).hexdigest()
    return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{hash_str}"))
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import sys
import json
import requests


if __name__ == "__main__":
    url = sys.argv[1]
    url = 'http://' + url + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        # Parse as a plain integer; avoid eval() on raw user input.
        tokens_to_generate = int(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
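
A usage sketch for the client above (the address and port are assumptions;
point it at wherever the Megatron text-generation server is listening):

    $ python tools/text_generation_cli.py localhost:5000
    Enter prompt: The quick brown fox
    Enter number of tokens to generate: 32
    Megatron Response:
    ...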