├── .coveragerc ├── .github └── workflows │ └── python.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── dataset ├── README.md ├── download_books.sh ├── download_ckpt.sh └── download_vocab.sh ├── docs ├── distrib_optimizer.md └── images │ └── distrib_optimizer │ ├── data_flow.png │ └── sharding_scheme.png ├── examples ├── README.md ├── detxoify_lm │ ├── README.md │ ├── annotations │ │ ├── filter-selfgeneration.py │ │ ├── perspective_api_annotate.py │ │ └── preprocess.sh │ ├── finetune_gpt.py │ ├── finetune_gpt_distributed-1.3b.sh │ ├── generate-1.3b.sh │ ├── generate_samples_gpt.py │ ├── perspective_api.py │ └── self_generation │ │ └── selfgenerate-1.3b-unconditional.sh ├── evaluate_retriever_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── finetune_retriever_distributed.sh ├── merge_mp_bert.sh ├── msdp │ ├── README.md │ ├── data_processing.sh │ ├── eval_knwl_generation.sh │ ├── eval_resp_generation.sh │ ├── prep_resp_gen.sh │ ├── prompt_knwl_gen.sh │ └── prompt_resp_gen.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_text_generation_server_345M.sh ├── run_text_generation_server_345M_8_tensor_parallel.sh └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── examples_deepspeed ├── MoE │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ ├── ds_pretrain_gpt_1.3B_dense.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ ├── ds_pretrain_gpt_125M_MoE64.sh │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ ├── ds_pretrain_gpt_350M_MoE128.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ ├── ds_pretrain_gpt_350M_dense.sh │ ├── ds_pretrain_gpt_6.7B_dense.sh │ └── readme_evalharness.md ├── README.md ├── azure │ ├── README.md │ ├── run-175b.sh │ ├── run-1t.sh │ └── run-benchmark-model.sh ├── azureml │ ├── Dockerfile.dockerfile │ ├── README.md │ ├── aml_submit.py │ └── prepare_dataset.py ├── bert_with_pile │ ├── README.md │ ├── ds_config_bert_TEMPLATE.json │ ├── ds_finetune_bert_mnli.sh │ ├── ds_finetune_bert_qqp.sh │ ├── ds_finetune_bert_race.sh │ ├── ds_pretrain_bert.sh │ └── prepare_pile_data.py ├── compression │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_TEMPLATE_compression.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ └── ds_pretrain_gpt_350M_dense_kd.sh ├── curriculum_learning │ ├── README.md │ ├── ds_config_gpt_slw_TEMPLATE.json │ ├── ds_pretrain_gpt2.sh │ ├── ds_pretrain_gpt_1.3B_rope_slw.sh │ ├── ds_train.sh │ ├── ds_zero_stage_1_config_baseline.json │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json 
├── data_efficiency │ ├── README.md │ ├── analyze_data.py │ ├── bert │ │ ├── ds_analyze_bert_data_map.sh │ │ ├── ds_analyze_bert_data_reduce.sh │ │ ├── finetune │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_mnli.sh │ │ │ ├── ds_finetune_bert_qqp.sh │ │ │ ├── ds_finetune_bert_race.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── finetune_glue │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_glue.sh │ │ │ ├── ds_finetune_bert_glue_run.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── pile_data_download_preprocess.py │ │ └── pretrain │ │ │ ├── ds_config_bert_1clmetric_TEMPLATE.json │ │ │ ├── ds_config_bert_2clmetrics_TEMPLATE.json │ │ │ ├── ds_pretrain_bert_336M_base_script.sh │ │ │ └── ds_pretrain_bert_336M_run.sh │ └── gpt │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── eval │ │ ├── ds_config_eval_dummy.json │ │ ├── ds_evalharness_1gpu.sh │ │ ├── ds_evalharness_gather_result.py │ │ ├── ds_evalharness_parallel_run.sh │ │ └── ds_evalharness_parallel_run_10shot.sh │ │ └── pretrain │ │ ├── ds_config_gpt_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt_2clmetrics_TEMPLATE.json │ │ ├── ds_pretrain_gpt_1.3B_dense_base_script.sh │ │ └── ds_pretrain_gpt_1.3B_dense_run.sh ├── deepspeed4science │ └── megatron_long_seq_support │ │ ├── README.md │ │ ├── ds_config_gpt_TEMPLATE.json │ │ ├── host_file │ │ ├── pretrain_gpt_1.3B_seq_parallel.sh │ │ └── pretrain_gpt_30B_seq_parallel.sh ├── finetune_hf_llama │ ├── README.md │ ├── ds_config.json │ ├── ds_config_empty.json │ └── finetune_llama.sh ├── generate_text.sh ├── offload_pp │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_pretrain_gpt_350M.sh │ └── twin-offload.png ├── pretrain_llama2_distributed.sh ├── pretrain_llama_distributed.sh ├── rebase │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_slw_TEMPLATE.json │ ├── ds_pretrain_gpt_1.3B.sh │ ├── ds_pretrain_gpt_1.3B_megatron_checkpointing.sh │ ├── ds_pretrain_gpt_1.3B_rope.sh │ ├── ds_pretrain_gpt_1.3B_rope_slw.sh │ ├── ds_pretrain_gpt_125M.sh │ ├── ds_pretrain_gpt_125M_flashattn.sh │ ├── ds_pretrain_gpt_13B.sh │ ├── gpt2-merges.txt │ └── gpt2-vocab.json ├── run_deepspeed_example.sh ├── sequence_parallel │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_pretrain_gpt_1.3B_seq_parallel_32k.sh │ ├── ds_pretrain_gpt_30B_seq_parallel_32k.sh │ ├── ds_pretrain_gpt_6.7B_fpdt_32k.sh │ └── preprocess_bookcorpus.py ├── universal_checkpointing │ ├── README.md │ ├── assets │ │ └── image │ │ │ ├── uc_char_training_loss.png │ │ │ ├── uc_char_validation_loss.png │ │ │ ├── uc_stage3_char_training_loss.png │ │ │ └── uc_stage3_char_validation_loss.png │ ├── ds_config.json │ ├── llama │ │ ├── run_llama_bf16.sh │ │ ├── run_tb_analysis_llama.sh │ │ └── run_universal_llama_bf16.sh │ ├── megatron_gpt │ │ ├── run_bf16.sh │ │ ├── run_fp16.sh │ │ ├── run_tb_analysis_gpt.sh │ │ ├── run_tb_analysis_gpt_plot_only.sh │ │ ├── run_universal_bf16.sh │ │ └── run_universal_fp16.sh │ └── tb_analysis │ │ ├── abstract_analysis.py │ │ ├── arguments.py │ │ ├── tb_analysis_script.py │ │ ├── uc_analysis.py │ │ └── utils.py └── zero_bubble_pp │ ├── README.md │ ├── benchmark.png │ ├── bw_split.png │ ├── zbh1.png │ └── zbh1_pretrain_gpt_1.3b.sh ├── finetune_llama.py ├── images ├── Achieved_petaFLOPs.png └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ 
├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── model_parallel_config.py │ ├── models │ │ ├── __init__.py │ │ └── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_embedding.py │ │ │ └── gpt_model.py │ ├── package_info.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── deepspeed_zbh1_engine.py │ │ ├── deepspeed_zbh1_schedule.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── sequence_parallel │ │ ├── __init__.py │ │ └── cross_entropy.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ ├── utils.py │ │ └── weight_grad_store.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── core_attention.py │ │ ├── custom_layers │ │ │ └── transformer_engine.py │ │ ├── enums.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── prompt_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_softmax.cpp │ ├── scaled_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_rmsnorm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rmsnorm.py │ ├── rotary_pos_embedding.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── __init__.py │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── p2p_communication.py ├── profiler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── text_generation_utils.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── 
utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_core.py ├── pretrain_ict.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── cola.py │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ ├── mrpc.py │ ├── qnli.py │ ├── qqp.py │ ├── rte.py │ ├── sst2.py │ └── stsb.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── conftest.py ├── functional_tests │ ├── __init__.py │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── check_slurm_job_completion.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── test_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ └── jobwait.sh │ ├── test_results │ │ ├── bert │ │ │ ├── bert_tp1_pp2_1nodes_50steps.json │ │ │ ├── bert_tp1_pp4_1nodes_50steps.json │ │ │ ├── bert_tp2_pp2_1nodes_50steps.json │ │ │ └── bert_tp4_pp1_1nodes_50steps.json │ │ └── gpt3 │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps.json │ │ │ └── gpt3_tp4_pp1_1nodes_50steps.json │ └── test_scripts │ │ ├── bert │ │ ├── pretrain_bert_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_bert_distributed_test.sh │ │ ├── sbatch_bert_distributed_resume_checkpoint_test.sh │ │ └── sbatch_bert_distributed_test.sh │ │ └── gpt3 │ │ ├── pretrain_gpt3_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_gpt3_distributed_test.sh │ │ ├── sbatch_gpt3_distributed_resume_checkpoint_test.sh │ │ └── sbatch_gpt3_distributed_test.sh ├── models │ ├── __init__.py │ ├── test_gpt_embedding.py │ └── test_gpt_model.py ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py ├── run_megatron.py ├── tensor_parallel │ └── __int__.py ├── test_megatron.py ├── transformer │ ├── __init__.py │ ├── test_core_attention.py │ ├── test_module.py │ ├── test_parallel_attention.py │ ├── test_parallel_mlp.py │ ├── test_parallel_transformer_block.py │ ├── test_parallel_transformer_layer.py │ └── test_transformer_config.py └── unit_tests │ ├── __init__.py │ ├── tensor_parallel │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_parallel_state.py │ ├── test_utilities.py │ └── test_utils.py └── tools ├── __init__.py ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint_loader_megatron.py ├── checkpoint_saver_megatron.py ├── checkpoint_util.py ├── convert_checkpoint ├── README.md ├── 
deepspeed_checkpoint.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── generate_samples_gpt.py ├── hf2megads_weight_converter.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── retro ├── README.md ├── __init__.py ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── get_dataset_configs.sh │ ├── get_preprocess_cmd.sh │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: python 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | '**' 8 | schedule: 9 | - cron: "0 0 * * *" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | unit-tests: 17 | strategy: 18 | matrix: 19 | pyVersion: ["3.7", "3.8", "3.9", "3.10"] 20 | fail-fast: false 21 | 22 | runs-on: ubuntu-22.04 23 | container: 24 | image: deepspeed/gh-builder:py${{ matrix.pyVersion }} 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: environment 30 | run: | 31 | which python 32 | python --version 33 | - name: Install Megatron-DeepSpeed 34 | run: | 35 | pip3 install . 
36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | # Distribution / packaging 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | 8 | # binaries 9 | *.so 10 | 11 | # tmp files 12 | *.swp 13 | 14 | # AML workspace config file 15 | config.json 16 | 17 | .coverage_* 18 | *~ 19 | slurm* 20 | logs 21 | 22 | # Data folder 23 | bookcorpus_data/ -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc 4 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to set up the dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | -------------------------------------------------------------------------------- /dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../..
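# The unzipped checkpoint should follow the standard Megatron layout, i.e.
# checkpoints/gpt2_345m/latest_checkpointed_iteration.txt plus
# checkpoints/gpt2_345m/release/mp_rank_00/model_optim_rng.pt (an assumption
# based on the usual NGC packaging of the 345M model); the example scripts
# then consume it via --load checkpoints/gpt2_345m.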
8 | 9 | -------------------------------------------------------------------------------- /dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/docs/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/docs/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Original examples by NVIDIA/Megatron-LM 2 | 3 | This folder includes examples from the original NVIDIA/Megatron-LM repo. None of them integrate DeepSpeed technologies, and some may not work due to changes in this Megatron-DeepSpeed repo. We therefore recommend the ```../examples_deepspeed/``` folder, which includes examples that have DeepSpeed technologies integrated and are tested by the DeepSpeed team. 4 | -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Change for multinode config 4 | GPUS_PER_NODE=16 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(($RANDOM + 1024)) 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | # input 12 | DATA_PATH=$1 13 | SHARE_DATA=$PWD # current work dir 14 | FINETUNED_PATH="$SHARE_DATA/$2" 15 | lr=$3 16 | bs=$4 17 | iter=$5 18 | CHECKPOINT_PATH=$6 19 | 20 | # vocab 21 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 22 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 23 | 24 | # tensorboard 25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" 26 | mkdir -p ${TENSORBOARD_DIR} 27 | 28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 29 | 30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \ 31 | examples/detxoify_lm/finetune_gpt.py \ 32 | --num-layers 24 \ 33 | --hidden-size 2048 \ 34 | --num-attention-heads 32 \ 35 | --micro-batch-size 4 \ 36 | --global-batch-size $bs \ 37 | --seq-length 2048 \ 38 | --max-position-embeddings 2048 \ 39 | --train-iters $iter \ 40 | --save $FINETUNED_PATH \ 41 | --load $CHECKPOINT_PATH \ 42 | --data-path $DATA_PATH \ 43 | --data-path2 ${DATA_BLEND} \ 44 | --vocab-file $VOCAB_FILE \ 45 | --merge-file $MERGE_FILE \ 46 | --data-impl mmap \ 47 | --split 100,0,0 \ 48 | --distributed-backend nccl \ 49 | --lr-decay-style constant \ 50 | --lr $lr \ 51 | --clip-grad 1.0 \ 52 | --weight-decay 0.1 \ 53 | --adam-beta1 0.9 \ 54 | --adam-beta2 0.95 \ 55 | --checkpoint-activations \ 56 | --log-interval 1 \ 57 | --save-interval 78 \ 58 | --eval-interval 78 \ 59 | --eval-iters 50 \ 60 | --fp16 \ 61 | --DDP-impl local \ 62 | --finetune --no-load-optim \ 63 | --log-validation-ppl-to-tensorboard \ 64 | --tensorboard-dir ${TENSORBOARD_DIR} 65 | -------------------------------------------------------------------------------- /examples/detxoify_lm/generate-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | VOCAB_FILE=gpt2-vocab.json 4 | MERGE_FILE=gpt2-merges.txt 5 | 6 | GPUS_PER_NODE=1 7 | # Change for multinode config 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(($RANDOM + 1024)) 10 | NNODES=1 11 | NODE_RANK=0 12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 13 | NUM_SAMPLES=$(wc -l < $1) 14 | PREFIX=$(basename $2) 15 | SEED=$(($RANDOM)) 16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl 17 | 18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 19 | 20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 21 | --tensor-model-parallel-size 1 \ 22 | --num-layers 24 \ 23 | --hidden-size 2048 \ 24 | --load $CHECKPOINT_PATH \ 25 | --num-attention-heads 32 \ 26 | --max-position-embeddings 2048 \ 27 | --tokenizer-type GPT2BPETokenizer \ 28 | --fp16 \ 29 | --micro-batch-size 400 \ 30 | --seq-length 2048 \ 31 | --out-seq-length 20 \ 32 | --temperature 1.0 \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --sample-input-file $1 \ 36 | --sample-output-file $OUTPUT \ 37 | --num-samples $NUM_SAMPLES \ 38 | --max-tokens-to-oom 1200000 \ 39 | --top_p 0.9 \ 40 | --seed $SEED 41 | 42 | -------------------------------------------------------------------------------- /examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | SHARE_DATA=$PWD # current work dir 4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 6 | 7 | GPUS_PER_NODE=1 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=$(($RANDOM + 1024)) 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | SEED=$3 15 | SUFFIX=$(basename $CHECKPOINT_PATH) 16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ 17 | mkdir -p $save_dir 18 | echo $save_dir/$SEED.out 19 | 20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 21 | 22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 23 | --tensor-model-parallel-size 1 \ 24 | --num-layers 24 \ 25 | --hidden-size 2048 \ 26 | --load $CHECKPOINT_PATH \ 27 | --num-attention-heads 32 \ 28 | --max-position-embeddings 2048 \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --fp16 \ 31 | --micro-batch-size 150 \ 32 | --seq-length 2048 \ 33 | --out-seq-length 1000 \ 34 | --temperature 1.0 \ 35 | --vocab-file $VOCAB_FILE \ 36 | --merge-file $MERGE_FILE \ 37 | --num-samples $1 \ 38 | --top_p 0.9 \ 39 | --max-tokens-to-oom 1200000 \ 40 | --genfile $save_dir/$SEED.out \ 41 | --seed $SEED 42 | 43 | -------------------------------------------------------------------------------- /examples/evaluate_retriever_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model or a finetuned model for Natural Question task 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task RETRIEVER-EVAL \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --activations-checkpoint-method uniform \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --faiss-use-gpu \ 33 | --retriever-report-topk-accuracies 1 5 20 100 \ 34 | --fp16 \ 35 | --indexer-log-interval 1000 \ 36 | --indexer-batch-size 128 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 
| --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- 
/examples/finetune_retriever_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finetune a BERT or pretrained ICT model using Google natural question data 4 | # Datasets can be downloaded from the following link: 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= 16 | 17 | # Load either of the below 18 | BERT_LOAD_PATH= 19 | PRETRAINED_CHECKPOINT= 20 | 21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 22 | --task RET-FINETUNE-NQ \ 23 | --train-with-neg \ 24 | --train-hard-neg 1 \ 25 | --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --tensor-model-parallel-size 1 \ 30 | --tokenizer-type BertWordPieceLowerCase \ 31 | --train-data nq-train.json \ 32 | --valid-data nq-dev.json \ 33 | --save ${CHECKPOINT_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --vocab-file bert-vocab.txt \ 36 | --bert-load ${BERT_LOAD_PATH} \ 37 | --save-interval 5000 \ 38 | --log-interval 10 \ 39 | --eval-interval 20000 \ 40 | --eval-iters 100 \ 41 | --indexer-log-interval 1000 \ 42 | --faiss-use-gpu \ 43 | --DDP-impl torch \ 44 | --fp16 \ 45 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 46 | --seq-length 512 \ 47 | --retriever-seq-length 256 \ 48 | --max-position-embeddings 512 \ 49 | --retriever-score-scaling \ 50 | --epochs 80 \ 51 | --micro-batch-size 8 \ 52 | --eval-micro-batch-size 16 \ 53 | --indexer-batch-size 128 \ 54 | --lr 2e-5 \ 55 | --lr-warmup-fraction 0.01 \ 56 | --weight-decay 1e-1 57 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, covering data preparation as well as knowledge and response generation. More details are available in the [`msdp` task directory](../../tasks/msdp). 5 | 6 | -------------------------------------------------------------------------------- /examples/msdp/eval_knwl_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores.
5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_knowledge_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_knowledge_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ############################################ 32 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 33 | ############################################ 34 | 35 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to 36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 37 | 38 | # To evaluate on these metrics, please set up the environment based on 39 | # the nlg-eval GitHub repo, and run the corresponding evaluation commands. 40 | 41 | nlg-eval \ 42 | --hypothesis= \ 43 | --references= 44 | -------------------------------------------------------------------------------- /examples/msdp/eval_resp_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_response_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_response_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ########################## 32 | # Evaluate the KF1 scores. 33 | ########################## 34 | 35 | MODEL_GEN_PATH= \ 36 | (e.g., /testseen_response_generations.txt) 37 | GROUND_TRUTH_PATH= \ 38 | (e.g., /testseen_knowledge_reference.txt) 39 | 40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 2048 \ 45 | --max-position-embeddings 2048 \ 46 | --micro-batch-size 4 \ 47 | --task MSDP-EVAL-F1 \ 48 | --guess-file ${MODEL_GEN_PATH} \ 49 | --answer-file ${GROUND_TRUTH_PATH} 50 | 51 | 52 | ############################################ 53 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 54 | ############################################ 55 | 56 | # We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to 57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 58 | 59 | # To evaluate on these metrics, please set up the environment based on 60 | # the nlg-eval GitHub repo, and run the corresponding evaluation commands.
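# A filled-in sketch of the nlg-eval call below (the file names are
# placeholders for your own generation/reference files, not files
# produced by this repo):
#   nlg-eval --hypothesis=testseen_response_generations.txt \
#            --references=testseen_response_reference.txt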
61 | 62 | nlg-eval \ 63 | --hypothesis= \ 64 | --references= 65 | -------------------------------------------------------------------------------- /examples/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/msdp/prompt_knwl_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge 5 | # The size of the pretrained language model is 357M 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= (e.g., /357m) 16 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 17 | MERGE_PATH= (e.g., /gpt2-merges.txt) 18 | INPUT_PATH= \ 19 | (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /testseen_knowledge_prompts.json) 22 | OUTPUT_PATH= \ 23 | (e.g., /testseen_knowledge_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type knowledge \ 42 | --num-prompt-examples 10 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/msdp/prompt_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1 5 | # The output is the corresponding response. 
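# (The Stage-1 knowledge generations come from prompt_knwl_gen.sh, and
# prep_resp_gen.sh merges them into the input file consumed here.)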
6 | # The size of the pretrained language model is 357M 7 | 8 | WORLD_SIZE=8 9 | 10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 11 | --nnodes 1 \ 12 | --node_rank 0 \ 13 | --master_addr localhost \ 14 | --master_port 6000" 15 | 16 | CHECKPOINT_PATH= (e.g., /357m) 17 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 18 | MERGE_PATH= (e.g., /gpt2-merges.txt) 19 | INPUT_PATH= (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /response_prompts.txt) 22 | OUTPUT_PATH= \ 23 | (e.g., /output_testseen_response_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type response \ 42 | --num-prompt-examples 20 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/bert-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | BERT_ARGS=" 10 | --num-layers 24 \ 11 | --hidden-size 1024 \ 12 | --num-attention-heads 16 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --micro-batch-size 4 \ 16 | --global-batch-size 8 \ 17 | --lr 0.0001 \ 18 | --train-iters 2000000 \ 19 | --lr-decay-iters 990000 \ 20 | --lr-decay-style linear \ 21 | --min-lr 0.00001 \ 22 | --weight-decay 1e-2 \ 23 | --lr-warmup-fraction .01 \ 24 | --clip-grad 1.0 \ 25 | --fp16 26 | " 27 | 28 | DATA_ARGS=" 29 | --data-path $DATA_PATH \ 30 | --vocab-file $VOCAB_FILE \ 31 | --data-impl mmap \ 32 | --split 949,50,1 33 | " 34 | 35 | OUTPUT_ARGS=" 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 40 | " 41 | 42 | torchrun pretrain_bert.py \ 43 | $BERT_ARGS \ 44 | $DATA_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --save $CHECKPOINT_PATH \ 47 | --load $CHECKPOINT_PATH 48 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 512 \ 30 | --max-position-embeddings 512 \ 31 | --micro-batch-size 4 \ 32 | --global-batch-size 32 \ 33 | --lr 0.0001 \ 34 | --train-iters 1000000 \ 35 | --lr-decay-iters 
990000 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --weight-decay 1e-2 \ 39 | --lr-warmup-fraction .01 \ 40 | --clip-grad 1.0 \ 41 | --fp16 42 | " 43 | 44 | DATA_ARGS=" 45 | --data-path $DATA_PATH \ 46 | --vocab-file $VOCAB_FILE \ 47 | --data-impl mmap \ 48 | --split 949,50,1 49 | " 50 | 51 | OUTPUT_ARGS=" 52 | --log-interval 100 \ 53 | --save-interval 10000 \ 54 | --eval-interval 1000 \ 55 | --eval-iters 10 56 | " 57 | 58 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 59 | $BERT_ARGS \ 60 | $DATA_ARGS \ 61 | $OUTPUT_ARGS \ 62 | --distributed-backend nccl \ 63 | --save $CHECKPOINT_PATH \ 64 | --load $CHECKPOINT_PATH 65 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --pipeline-model-parallel-size 2 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --seq-length 512 \ 32 | --max-position-embeddings 512 \ 33 | --micro-batch-size 2 \ 34 | --global-batch-size 16 \ 35 | --lr 0.0001 \ 36 | --train-iters 1000000 \ 37 | --lr-decay-iters 990000 \ 38 | --lr-decay-style linear \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --lr-warmup-fraction .01 \ 42 | --clip-grad 1.0 \ 43 | --fp16 44 | " 45 | 46 | DATA_ARGS=" 47 | --data-path $DATA_PATH \ 48 | --vocab-file $VOCAB_FILE \ 49 | --data-impl mmap \ 50 | --split 949,50,1 51 | " 52 | 53 | OUTPUT_ARGS=" 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 58 | " 59 | 60 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 61 | $BERT_ARGS \ 62 | $DATA_ARGS \ 63 | $OUTPUT_ARGS \ 64 | --distributed-backend nccl \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH 67 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | CHECKPOINT_PATH= 8 | VOCAB_FILE=/gpt2-vocab.json 9 | MERGE_FILE=/gpt2-merges.txt 10 | DATA_PATH=_text_document 11 | 12 | GPT_ARGS=" 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 1024 \ 17 | --max-position-embeddings 1024 \ 18 | --micro-batch-size 4 \ 19 | --global-batch-size 8 \ 20 | --lr 0.00015 \ 21 | --train-iters 500000 \ 22 | --lr-decay-iters 320000 \ 23 | --lr-decay-style cosine \ 24 | --min-lr 1.0e-5 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 29 | " 30 | 31 | DATA_ARGS=" 32 | --data-path $DATA_PATH \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | 
--eval-iters 10 44 | " 45 | 46 | torchrun pretrain_gpt.py \ 47 | $GPT_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 \ 52 | --activations-checkpoint-method uniform " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --num-layers 24 \ 30 | --hidden-size 1024 \ 31 | --num-attention-heads 16 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --micro-batch-size 8 \ 35 | --global-batch-size 64 \ 36 | --lr 0.00015 \ 37 | --train-iters 500000 \ 38 | --lr-decay-iters 320000 \ 39 | --lr-decay-style cosine \ 40 | --min-lr 1.0e-5 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 45 | " 46 | 47 | DATA_ARGS=" 48 | --data-path $DATA_PATH \ 49 | --vocab-file $VOCAB_FILE \ 50 | --merge-file $MERGE_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun 
$DISTRIBUTED_ARGS pretrain_gpt.py \ 63 | $GPT_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --tensor-model-parallel-size 2 \ 30 | --pipeline-model-parallel-size 2 \ 31 | --sequence-parallel \ 32 | --num-layers 24 \ 33 | --hidden-size 1024 \ 34 | --num-attention-heads 16 \ 35 | --seq-length 1024 \ 36 | --max-position-embeddings 1024 \ 37 | --micro-batch-size 4 \ 38 | --global-batch-size 16 \ 39 | --lr 0.00015 \ 40 | --train-iters 500000 \ 41 | --lr-decay-iters 320000 \ 42 | --lr-decay-style cosine \ 43 | --min-lr 1.0e-5 \ 44 | --weight-decay 1e-2 \ 45 | --lr-warmup-fraction .01 \ 46 | --clip-grad 1.0 \ 47 | --fp16 48 | " 49 | 50 | DATA_ARGS=" 51 | --data-path $DATA_PATH \ 52 | --vocab-file $VOCAB_FILE \ 53 | --merge-file $MERGE_FILE \ 54 | --data-impl mmap \ 55 | --split 949,50,1 56 | " 57 | 58 | OUTPUT_ARGS=" 59 | --log-interval 100 \ 60 | --save-interval 10000 \ 61 | --eval-interval 1000 \ 62 | --eval-iters 10 63 | " 64 | 65 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 66 | $GPT_ARGS \ 67 | $DATA_ARGS \ 68 | $OUTPUT_ARGS \ 69 | --distributed-backend nccl \ 70 | --save $CHECKPOINT_PATH \ 71 | --load $CHECKPOINT_PATH 72 | 73 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/t5-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | T5_ARGS=" 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --max-position-embeddings 512 \ 18 | --micro-batch-size 16 \ 19 | --global-batch-size 16 \ 20 | --lr 0.0001 \ 21 | --train-iters 1000000 \ 22 | --lr-decay-iters 1000000 \ 23 | --lr-decay-style linear \ 24 | --min-lr 0.00001 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 \ 29 | --vocab-extra-ids 100 30 | " 31 | 32 | DATA_ARGS=" 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 44 | " 45 | 46 | torchrun pretrain_t5.py \ 47 | $T5_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --kv-channels 64 \ 30 | --ffn-hidden-size 3072 \ 31 | --encoder-seq-length 512 \ 32 | --decoder-seq-length 128 \ 33 | --max-position-embeddings 512 \ 34 | --micro-batch-size 16 \ 35 | --global-batch-size 128 \ 
36 | --lr 0.0001 \ 37 | --train-iters 1000000 \ 38 | --lr-decay-iters 1000000 \ 39 | --lr-decay-style linear \ 40 | --min-lr 0.00001 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 \ 45 | --vocab-extra-ids 100 46 | " 47 | 48 | DATA_ARGS=" 49 | --data-path $DATA_PATH \ 50 | --vocab-file $VOCAB_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 63 | $T5_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --num-layers 12 \ 28 | --hidden-size 768 \ 29 | --num-attention-heads 12 \ 30 | --kv-channels 64 \ 31 | --ffn-hidden-size 3072 \ 32 | --encoder-seq-length 512 \ 33 | --decoder-seq-length 128 \ 34 | --max-position-embeddings 512 \ 35 | --micro-batch-size 16 \ 36 | --global-batch-size 128 \ 37 | --lr 0.0001 \ 38 | --train-iters 1000000 \ 39 | --lr-decay-iters 1000000 \ 40 | --lr-decay-style linear \ 41 | --min-lr 0.00001 \ 42 | --weight-decay 1e-2 \ 43 | --lr-warmup-fraction .01 \ 44 | --clip-grad 1.0 \ 45 | --fp16 \ 46 | --vocab-extra-ids 100 47 | " 48 | 49 | DATA_ARGS=" 50 | --data-path $DATA_PATH \ 51 | --vocab-file $VOCAB_FILE \ 52 | --data-impl mmap \ 53 | --split 949,50,1 54 | " 55 | 56 | OUTPUT_ARGS=" 57 | --log-interval 100 \ 58 | --save-interval 10000 \ 59 | --eval-interval 1000 \ 60 | --eval-iters 10 61 | " 62 | 63 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 64 | $T5_ARGS \ 65 | $DATA_ARGS \ 66 | $OUTPUT_ARGS \ 67 | --distributed-backend nccl \ 68 | --save $CHECKPOINT_PATH \ 69 | --load $CHECKPOINT_PATH 70 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 
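# Once the server is up, it can be queried over HTTP from another shell.
# A minimal sketch of such a request (the port, endpoint, and JSON fields
# below are assumptions about the flask-restful server started by this
# script, not something this script guarantees; adjust to your deployment):
#
#   curl 'http://localhost:5000/api' -X 'PUT' \
#     -H 'Content-Type: application/json; charset=UTF-8' \
#     -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'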
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --out-seq-length 1024 \ 30 | --temperature 1.0 \ 31 | --vocab-file $VOCAB_FILE \ 32 | --merge-file $MERGE_FILE \ 33 | --top_p 0.9 \ 34 | --seed 42 35 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 
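# For example, the run_figure_* scripts in this directory set it before
# sourcing this file to turn on activation checkpointing:
#
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "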
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Setup 13 | 14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 15 | update the unspecified values (in angle brackets `<...>`) before launching any 16 | scripts. 17 | 18 | 19 | 20 | ## Scripts 21 | 22 | Below is a list of scripts that can be used to reproduce various figures in our 23 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 24 | 25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 26 | for GPT models ranging from 1 billion to 1 trillion parameters. 27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 28 | performance of pipeline parallelism. 29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 30 | the interleaved schedule on a 175B GPT model. 31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 32 | different degrees of pipeline and tensor model parallelism on a model with 33 | 162.2 billion parameters. 34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 35 | different degrees of data and pipeline model parallelism on a model with 36 | 5.9 billion parameters. 37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 38 | different degrees of data and tensor model parallelism on a model with 39 | 5.9 billion parameters. 40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 41 | microbatch size. 42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 43 | activation recomputation. 44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 45 | the scatter-gather communication optimization. 
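Each of these scripts follows the same pattern: the sweep variables (e.g. `PP`, `GBS`) are set at the top, the remaining model and parallelism parameters are derived from them, and the job is submitted by sourcing [`CONFIG.sh`](./CONFIG.sh) and [`SBATCH.sh`](./SBATCH.sh). A typical launch is sketched below (values are illustrative):

```bash
# Fill in CONFIG.sh first (SLURM partition/account, code and data paths).
cd examples/sc21
# Edit the sweep variables at the top of the chosen script,
# e.g. PP=8 and GBS=128 in run_figure_11.sh, then submit:
bash run_figure_11.sh
```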
46 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples_deepspeed/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/azure/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for experimentation on Azure 2 | 3 | The recipes have been tested from the command line on a cluster set up using Azure VMs and VMSS, as well as inside Docker-based environments. 4 | 5 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows: 6 | 7 | ```bash examples_deepspeed/azure/run-benchmark-model.sh``` 8 | 9 | ### Pre-requisites 10 | 11 | To run the above script, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder: 12 | 13 | ```bash dataset/download_books.sh``` 14 | 15 | ```bash dataset/download_vocab.sh``` 16 | 17 | ### Run 175B and 1T models 18 | 19 | We have included two recipes for the 175B model and the 1T model. To train the model, we assume that users will modify and tune hyperparameters and configurations by themselves.
To facilitate initial training, we have made the recipes runnable with the Books dataset as follows. 20 | 21 | ```bash examples_deepspeed/azure/run-175b.sh``` 22 | 23 | ```bash examples_deepspeed/azure/run-1t.sh``` 24 | 25 | ### Note about ZeRO stage 3 and CPU offload 26 | 27 | By default, we have enabled ZeRO Stage 3 for both of the recipes above. For the 1T model, we have also enabled the CPU-offload feature to save on memory and enable a larger batch size that offers better performance. 28 | -------------------------------------------------------------------------------- /examples_deepspeed/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-cuda11.3:12 2 | USER root:root 3 | 4 | RUN pip install pybind11 5 | RUN pip install regex -------------------------------------------------------------------------------- /examples_deepspeed/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Set up an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create an AML dataset. To run a remote AML job, you need to provide an AML FileDataset. 11 | Refer to the [prepare_dataset script](prepare_dataset.py) for how to upload the .bin and .idx files to the blob store and how to create a FileDataset. 12 | 13 | > Note: The folder `bookcorpus_data` used by the [prepare_dataset script](prepare_dataset.py) should not be under the `azureml` directories, because Azure ML does not allow large files (limit: 100 files or 1048576 bytes) in the Docker build context. 14 | 15 | # Training 16 | Run Megatron-DeepSpeed on Azure ML. Refer to the [aml_submit script](aml_submit.py). 17 | -------------------------------------------------------------------------------- /examples_deepspeed/azureml/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Use this script to upload data to the blob store 2 | 3 | # AzureML libraries 4 | from azureml.core import Workspace 5 | from azureml.core.dataset import Dataset 6 | from azureml.data.datapath import DataPath 7 | 8 | ws = Workspace.from_config() 9 | print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') 10 | 11 | data_dir = "bookcorpus_data" # Local directory containing the data (.bin and .idx files) 12 | blobstore_datadir = data_dir # Blob store directory to store the data in 13 | 14 | datastore = ws.get_default_datastore() 15 | 16 | # Book Corpus Data 17 | print("upload dataset to blob store") 18 | uploaded_data = Dataset.File.upload_directory( 19 | src_dir=data_dir, 20 | target=DataPath(datastore, blobstore_datadir), 21 | show_progress=True 22 | ) 23 | 24 | # Usage after uploading the directory 25 | # To refer to the folder directly: 26 | train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)]) 27 | print(train_dataset) 28 | # To refer to a specific file: 29 | # train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir + "/filename.ext")]) 30 | # Create a DatasetConsumptionConfig to specify how to deliver the dataset to a compute target. 31 | # In the submitted run, files in the datasets will be either mounted or downloaded to a local path on the compute target.
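# (Illustrative note, not part of the original script: as_mount() streams
# files from the datastore on demand, which suits large datasets;
# as_download() copies everything to the compute target's local disk up
# front, which can be faster when the job re-reads the same files often.)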
32 | # input_data_dir = train_dataset.as_mount() 33 | # input_data_dir = train_dataset.as_download() 34 | -------------------------------------------------------------------------------- /examples_deepspeed/bert_with_pile/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | 26 | "wall_clock_breakdown" : false 27 | } 28 | -------------------------------------------------------------------------------- /examples_deepspeed/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples_deepspeed/compression/ds_config_gpt_TEMPLATE_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false, 38 | 39 | "compression_training": { 40 | "weight_quantization": { 41 | "shared_parameters":{ 42 | "enabled": true, 43 | "quantizer_kernel": false, 44 | "schedule_offset": 50, 45 | "quantize_groups": 48, 46 | "quantize_verbose": false, 47 | "quantization_type": "symmetric", 48 | "rounding": "nearest", 49 | "fp16_mixed_quantize":{ 50 | 
"enabled": false, 51 | "quantize_change_ratio": 0.001 52 | } 53 | }, 54 | "different_groups":{ 55 | "wq1": { 56 | "params": { 57 | "start_bits": 12, 58 | "target_bits": 4, 59 | "quantization_period": 50 60 | }, 61 | "modules": [ 62 | "encoder.layers" 63 | ] 64 | } 65 | } 66 | }, 67 | "activation_quantization": { 68 | "shared_parameters":{ 69 | "enabled": true, 70 | "quantization_type": "asymmetric", 71 | "range_calibration": "static", 72 | "schedule_offset": 50 73 | }, 74 | "different_groups":{ 75 | "aq1": { 76 | "params": { 77 | "bits": 8 78 | }, 79 | "modules": [ 80 | "encoder.layers" 81 | ] 82 | } 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_config_gpt_slw_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "curriculum_learning": { 24 | "enabled": true, 25 | "curriculum_type": "seqlen", 26 | "min_difficulty": CONFIG_CL_MIN, 27 | "max_difficulty": CONFIG_CL_MAX, 28 | "schedule_type": "fixed_linear", 29 | "schedule_config": { 30 | "total_curriculum_step": CONFIG_CL_DURATION, 31 | "difficulty_step": 8 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ 
$SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | 
"wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh: -------------------------------------------------------------------------------- 1 | hostname_and_rank=$1 2 | master_port=$2 3 | pretrained_checkpoint=$3 4 | 5 | # hostname_and_rank="worker-0:0,1,2,3" 6 | # master_port=12345 7 | # pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" 8 | 9 | tasks=( 10 | RTE 11 | MRPC 12 | STS-B 13 | CoLA 14 | SST-2 15 | QNLI 16 | QQP 17 | MNLI 18 | ) 19 | 20 | seeds=( 21 | 1234 22 | 1235 23 | 1236 24 | 1237 25 | 1238 26 | ) 27 | 28 | lrs=( 29 | 2e-5 30 | 3e-5 31 | 4e-5 32 | 5e-5 33 | ) 34 | 35 | for ((i=0;i<${#tasks[@]};++i)); do 36 | task=${tasks[i]} 37 | for ((j=0;j<${#seeds[@]};++j)); do 38 | seed=${seeds[j]} 39 | for ((k=0;k<${#lrs[@]};++k)); do 40 | lr=${lrs[k]} 41 | bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} 42 | done 43 | done 44 | done -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "dataloader_drop_last": true, 24 | "data_efficiency": { 25 | "enabled": true, 26 | "seed": DATA_EFFICIENCY_SEED, 27 | "data_routing": { 28 | "enabled": LTD_ENABLED, 29 | "random_ltd":{ 30 | "enabled": LTD_ENABLED, 31 | "total_layer_num": 24, 32 | "random_ltd_layer_num": 22, 33 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 34 | "model_mask_name": "attention_mask", 35 | "model_type": "encoder", 36 | "hidden_state_order": "seq_batch_dim", 37 | "random_ltd_schedule": { 38 | "min_value": LTD_MIN, 39 | "max_value": LTD_MAX, 40 | "schedule_type":"fixed_linear", 41 | "schedule_config": { 42 | "require_steps": LTD_STEP, 43 | "seq_per_step": 16 44 | } 45 | } 46 | } 47 | }, 48 | "data_sampling": { 49 | "enabled": CL_ENABLED, 50 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 51 | "curriculum_learning": { 52 | "enabled": CL_ENABLED, 53 | "data_cluster_path": "CL_CLUSTER_PATH", 54 | "curriculum_metrics": { 55 | "CL_1st_METRIC_NAME": { 56 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 57 | "index_to_metric_path": "CL_1st_METRIC_PATH", 58 | "difficulty_type": "CL_1st_DIFF_TYPE", 59 | "clustering_type": "CL_1st_CLUSTER_TYPE", 60 | "min_difficulty": CL_1st_MIN, 61 | "max_difficulty": CL_1st_MAX, 62 | "schedule_type": "fixed_root", 63 | "schedule_config": { 64 | "total_curriculum_step": CL_1st_TOTAL_STEP, 65 | "difficulty_step": CL_1st_DIFF_STEP, 66 | "root_degree": CL_1st_ROOT 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- 
/examples_deepspeed/data_efficiency/gpt/eval/ds_config_eval_dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 2048, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 10, 5 | 6 | "zero_optimization": { 7 | "stage": 0 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": true, 12 | 13 | "fp16": { 14 | "enabled": false, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": false 24 | }, 25 | 26 | "wall_clock_breakdown" : false 27 | } -------------------------------------------------------------------------------- /examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "dataloader_drop_last": true, 24 | "data_efficiency": { 25 | "enabled": true, 26 | "seed": DATA_EFFICIENCY_SEED, 27 | "data_routing": { 28 | "enabled": LTD_ENABLED, 29 | "random_ltd":{ 30 | "enabled": LTD_ENABLED, 31 | "total_layer_num": 24, 32 | "random_ltd_layer_num": 22, 33 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 34 | "model_mask_name": "attention_mask", 35 | "model_type": "decoder", 36 | "hidden_state_order": "seq_batch_dim", 37 | "random_ltd_schedule": { 38 | "min_value": LTD_MIN, 39 | "max_value": LTD_MAX, 40 | "schedule_type":"fixed_linear", 41 | "schedule_config": { 42 | "require_steps": LTD_STEP, 43 | "seq_per_step": 16 44 | } 45 | } 46 | } 47 | }, 48 | "data_sampling": { 49 | "enabled": CL_ENABLED, 50 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 51 | "curriculum_learning": { 52 | "enabled": CL_ENABLED, 53 | "data_cluster_path": "CL_CLUSTER_PATH", 54 | "curriculum_metrics": { 55 | "CL_1st_METRIC_NAME": { 56 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 57 | "index_to_metric_path": "CL_1st_METRIC_PATH", 58 | "difficulty_type": "CL_1st_DIFF_TYPE", 59 | "clustering_type": "CL_1st_CLUSTER_TYPE", 60 | "min_difficulty": CL_1st_MIN, 61 | "max_difficulty": CL_1st_MAX, 62 | "schedule_type": "fixed_root", 63 | "schedule_config": { 64 | "total_curriculum_step": CL_1st_TOTAL_STEP, 65 | "difficulty_step": CL_1st_DIFF_STEP, 66 | "root_degree": CL_1st_ROOT 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /examples_deepspeed/deepspeed4science/megatron_long_seq_support/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | 
"initial_scale_power": 11 20 | }, 21 | 22 | "flops_profiler": { 23 | "enabled": true, 24 | "profile_step": 1, 25 | "module_depth": -1, 26 | "top_modules": 3, 27 | "detailed": true, 28 | "output_file": null 29 | }, 30 | 31 | "wall_clock_breakdown" : false 32 | } 33 | -------------------------------------------------------------------------------- /examples_deepspeed/deepspeed4science/megatron_long_seq_support/host_file: -------------------------------------------------------------------------------- 1 | worker-1 slots=4 2 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/README.md: -------------------------------------------------------------------------------- 1 | ## Example of Finetuning LLAMA-7B from Hugging Face Weights 2 | 3 | ### Dataset 4 | You can access the dataset from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). 5 | 6 | ### Pre-trained Weights 7 | The pre-trained weights can be found at [Hugging Face - LLAMA-7B](https://huggingface.co/huggyllama/llama-7b). 8 | 9 | ### Usage: 10 | 11 | #### 1. Converting Hugging Face Model Weights to Megatron-Deepspeed Model 12 | ```bash 13 | bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds 14 | ``` 15 | This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script.```convert_mds2hf``` can convert a Megatron-Deepspeed model into the Hugging Face format 16 | 17 | #### 2. Fine-tuning Process 18 | ```bash 19 | bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh 20 | ``` 21 | Execute this command to initiate the finetuning process. The task originates from [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca.git). 
22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /examples_deepspeed/finetune_hf_llama/ds_config_empty.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 100 5 | } 6 | -------------------------------------------------------------------------------- /examples_deepspeed/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX 3 | CHECKPOINT_PATH=dataset/checkpoints/gpt2_345m 4 | VOCAB_FILE=dataset/gpt2-vocab.json 5 | MERGE_FILE=dataset/gpt2-merges.txt 6 | b=8 7 | mp=1 8 | experts=1 9 | nodes=1 10 | gpus=1 11 | 12 | 13 | use_tutel="" 14 | #use_tutel="--use-tutel" 15 | 16 | 17 | ds_inference="" 18 | #ds_inference="--ds-inference" 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | 22 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" 23 | L=24 24 | H=1024 25 | A=16 26 | #experts1=${experts[$k]} 27 | program_cmd="tools/generate_samples_gpt.py \ 28 | --tensor-model-parallel-size $mp \ 29 | --num-layers $L \ 30 | --hidden-size $H \ 31 | --num-attention-heads $A \ 32 | --max-position-embeddings 1024 \ 33 | --tokenizer-type GPT2BPETokenizer \ 34 | --fp16 \ 35 | --num-experts ${experts} \ 36 | --mlp-type standard \ 37 | --micro-batch-size $b \ 38 | --seq-length 1024 \ 39 | --out-seq-length 1024 \ 40 | --temperature 1.0 \ 41 | --vocab-file $VOCAB_FILE \ 42 | --merge-file $MERGE_FILE \ 43 | --genfile unconditional_samples.json \ 44 | --top_p 0.9 \ 45 | --log-interval 1 \ 46 | --num-samples 0 \ 47 | --load $CHECKPOINT_PATH \ 48 | $use_tutel $ds_inference" 49 | 50 | echo $launch_cmd $program_cmd 51 | $launch_cmd $program_cmd 52 | -------------------------------------------------------------------------------- /examples_deepspeed/offload_pp/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 3, 8 | "offload_optimizer": { 9 | "device": "cpu", 10 | "pin_memory": true, 11 | "ratio": 0.3 12 | } 13 | }, 14 | 15 | "gradient_clipping": 1.0, 16 | "prescale_gradients":false, 17 | 18 | "fp16": { 19 | "enabled": CONFIG_FP16_ENABLED, 20 | "loss_scale": 0, 21 | "loss_scale_window": 500, 22 | "hysteresis": 2, 23 | "min_loss_scale": 1, 24 | "initial_scale_power": 11 25 | }, 26 | 27 | "bf16": { 28 | "enabled": CONFIG_BF16_ENABLED 29 | }, 30 | 31 | "wall_clock_breakdown" : false 32 | } 33 | -------------------------------------------------------------------------------- /examples_deepspeed/offload_pp/twin-offload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/offload_pp/twin-offload.png 
-------------------------------------------------------------------------------- /examples_deepspeed/rebase/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/rebase/ds_config_gpt_slw_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false, 23 | "curriculum_learning": { 24 | "enabled": true, 25 | "curriculum_type": "seqlen", 26 | "min_difficulty": CONFIG_CL_MIN, 27 | "max_difficulty": CONFIG_CL_MAX, 28 | "schedule_type": "fixed_linear", 29 | "schedule_config": { 30 | "total_curriculum_step": CONFIG_CL_DURATION, 31 | "difficulty_step": 8 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples_deepspeed/run_deepspeed_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | BASE_PATH=/vc_data/Megatron-LM/data 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron 6 | DS_CONFIG=ds_config.json 7 | 8 | TP=1 9 | PP=1 10 | NLAYERS=24 11 | HIDDEN=512 12 | 13 | GLOBAL_BATCH=64 14 | MICRO_BATCH=4 15 | 16 | ZERO_STAGE=2 17 | 18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 20 | mkdir -p $OUTPUT_DIR 21 | 22 | cat <<EOT > $DS_CONFIG 23 | { 24 | "train_batch_size" : $GLOBAL_BATCH, 25 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 26 | "steps_per_print": 1, 27 | 28 | "zero_optimization": { 29 | "stage": $ZERO_STAGE 30 | }, 31 | 32 | "fp16": { 33 | "enabled": true, 34 | "initial_scale_power": 12 35 | }, 36 | 37 | "wall_clock_breakdown" : true 38 | } 39 | EOT 40 | 41 | export NCCL_DEBUG=warn 42 | 43 | ds_args="" 44 | ds_args=" --deepspeed ${ds_args}" 45 | ds_args=" --no-pipeline-parallel ${ds_args}" 46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 49 | 50 | 51 | deepspeed pretrain_gpt.py \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size $PP \ 54 | --num-layers $NLAYERS \ 55 | --hidden-size $HIDDEN \ 56 | --num-attention-heads 16 \ 57 | --seq-length 256 \ 58 | --loss-scale 12 \ 59 | --max-position-embeddings 1024 \ 60 | --micro-batch-size $MICRO_BATCH \ 61 | --global-batch-size $GLOBAL_BATCH \ 62 | --train-iters 1000 \ 63 | --lr 6.0e-5 \ 64 | --min-lr 6.0e-6 \ 65 |
--lr-decay-style cosine \ 66 | --log-interval 1 \ 67 | --eval-iters 40 \ 68 | --eval-interval 1000 \ 69 | --data-path $DATA_PATH \ 70 | --vocab-file $BASE_PATH/gpt2-vocab.json \ 71 | --merge-file $BASE_PATH/gpt2-merges.txt \ 72 | --save-interval 1000 \ 73 | --split 98,2,0 \ 74 | --clip-grad 1.0 \ 75 | --weight-decay 0.1 \ 76 | --adam-beta1 0.9 \ 77 | --adam-beta2 0.95 \ 78 | --init-method-std 0.006 \ 79 | --fp16 \ 80 | --checkpoint-activations \ 81 | --tensorboard-dir $OUTPUT_DIR \ 82 | $ds_args \ 83 | --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log 84 | 85 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/README.md: -------------------------------------------------------------------------------- 1 | # Sequence Parallelism 2 | 3 | This folder contains examples that demonstrate how to use DeepSpeed's sequence parallelism. 4 | 5 | ## Setting Up the Environment for FlashAttention 6 | 7 | DeepSpeed's sequence parallelism can be combined with the following types of attention. 8 | 9 | - Classic attention 10 | - FlashAttention (enabled by `--use-flash-attn`) 11 | - FlashAttention + Triton (enabled by `--use-flash-attn-triton`) 12 | 13 | For the best performance, we recommend using FlashAttention + Triton. Here are the installation steps and the versions we have tested. Note that FlashAttention is compatible only with Turing, Ampere, Ada, or Hopper GPUs. 14 | 15 | ```shell 16 | # install triton 17 | git clone -b legacy-backend https://github.com/openai/triton 18 | cd triton/python/ 19 | pip install cmake 20 | pip install . 21 | 22 | # install flash-attention 23 | cd ${WORK_DIR} 24 | git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention 25 | cd flash-attention 26 | python setup.py install 27 | ``` 28 | 29 | ## Enabling Sequence Parallelism 30 | 31 | To enable sequence parallelism, set the degree of parallelism using the `--ds-sequence-parallel-size` argument. Ensure that the number of attention heads is divisible by this value. 32 | Ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) documentation for more details. 33 | 34 | Some working examples that enable sequence parallelism ([GPT1.3B](ds_pretrain_gpt_1.3B_seq_parallel_32k.sh), [GPT30B](ds_pretrain_gpt_30B_seq_parallel_32k.sh)) are available in this folder. 35 | 36 | Please note that our sequence parallelism feature is currently incompatible with Megatron-LM's tensor or pipeline parallelism.
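As a minimal sketch of the divisibility requirements described above (the values are illustrative, and this is not a full launch command):

```bash
NUM_HEADS=32   # --num-attention-heads
SP_SIZE=4      # --ds-sequence-parallel-size
HIDDEN=4096    # hidden size; head size = HIDDEN / NUM_HEADS

# The number of attention heads must be divisible by the sequence-parallel degree.
if [ $((NUM_HEADS % SP_SIZE)) -ne 0 ]; then
  echo "num-attention-heads must be divisible by ds-sequence-parallel-size" >&2
  exit 1
fi
# For the best FlashAttention performance, the head size should be divisible by 8.
if [ $(((HIDDEN / NUM_HEADS) % 8)) -ne 0 ]; then
  echo "warning: head size $((HIDDEN / NUM_HEADS)) is not divisible by 8" >&2
fi

SEQ_PARALLEL_ARGS="--ds-sequence-parallel-size ${SP_SIZE} --use-flash-attn-triton"
```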
37 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": PRESCALE_GRAD, 12 | 13 | "fp16": { 14 | "enabled": true, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "wall_clock_breakdown" : false 23 | } 24 | -------------------------------------------------------------------------------- /examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | train_data = load_dataset('bookcorpus/bookcorpus', split='train') 4 | train_data.to_json("BookCorpusDataset_text_document.json", lines=True) 5 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_training_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/universal_checkpointing/assets/image/uc_stage3_char_validation_loss.png -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 16, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 1, 5 | 6 | "zero_optimization": { 7 | "stage": 1 8 | }, 9 | 10 | "bf16": { 11 | "enabled": true 12 | }, 13 | 14 | "data_types": { 15 | "grad_accum_dtype": "fp32" 16 | }, 17 | 18 | "wall_clock_breakdown" : false 19 | } 20 | -------------------------------------------------------------------------------- 
/examples_deepspeed/universal_checkpointing/llama/run_tb_analysis_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Llama 7B Universal Checkpointing - Training Loss" \ 19 | 20 | # Validation Loss 21 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 22 | --tb_dir $OUTPUT_PATH \ 23 | --tb_event_key "lm-loss-validation/lm loss validation" \ 24 | --csv_name "val_" \ 25 | --plot_name "uc_char_validation_loss.png" \ 26 | --plot_title "Llama 7B Universal Checkpointing - Validation Loss" \ 27 | --plot_y_label "Validation LM Loss" \ 28 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ 19 | 20 | # Validation Loss 21 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 22 | --tb_dir $OUTPUT_PATH \ 23 | --tb_event_key "lm-loss-validation/lm loss validation" \ 24 | --csv_name "val_" \ 25 | --plot_name "uc_char_validation_loss.png" \ 26 | --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ 27 | --plot_y_label "Validation LM Loss" \ 28 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt_plot_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | OUTPUT_PATH=$1 8 | 9 | if [ "$OUTPUT_PATH" == "" ]; then 10 | OUTPUT_PATH="z1_uni_ckpt" 11 | fi 12 | 13 | # Training Loss 14 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 15 | --tb_dir $OUTPUT_PATH \ 16 | --tb_event_key "lm-loss-training/lm loss" \ 17 | --plot_name "uc_char_training_loss.png" \ 18 | --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \ 19 | --plot_only \ 20 | --csv_dir "/workspace/uc/megatron/loss_csv" \ 21 | 22 | # Validation Loss 23 | python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \ 24 | --tb_dir $OUTPUT_PATH \ 25 | --tb_event_key "lm-loss-validation/lm loss validation" \ 26 | --csv_name "val_" \ 27 | --plot_name "uc_char_validation_loss.png" \ 28 | --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \ 29 | --plot_y_label "Validation LM Loss" \ 30 | --plot_only \ 31 | --csv_dir "/workspace/uc/megatron/val_csv" \ 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/abstract_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import abc 7 | from abc import ABC 8 | 9 | 10 | class TensorBoardAnalysis(ABC): 11 | 12 | def __init__(self): 13 | self._name = None 14 | self._label_name = None 15 | self._csv_name = None 16 | 17 | @abc.abstractmethod 18 | def set_names(self, path_name): 19 | ... 20 | 21 | @abc.abstractmethod 22 | def get_label_name(self): 23 | ... 24 | 25 | @abc.abstractmethod 26 | def get_csv_filename(self): 27 | ... 28 | 29 | @abc.abstractmethod 30 | def path_regex(self): 31 | ... 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--tb_dir", required=True, type=str, help="Directory for tensorboard output") 10 | parser.add_argument("--analyzer", default="universal_checkpointing", type=str, choices=["universal_checkpointing"], help="Specify the analyzer to use") 11 | parser.add_argument("--tb_event_key", required=False, default="lm-loss-training/lm loss", type=str, help="Optional override of the TensorBoard event key") 12 | parser.add_argument("--plot_title", required=False, default="Megatron-GPT Universal Checkpointing", type=str, help="Optional override of the plot title") 13 | parser.add_argument("--plot_x_label", required=False, default="Training Step", type=str, help="Optional override of the plot x-label") 14 | parser.add_argument("--plot_y_label", required=False, default="LM Loss", type=str, help="Optional override of the plot y-label") 15 | parser.add_argument("--plot_name", required=False, default="uni_ckpt_char.png", type=str, help="Optional override of the plot file name") 16 | parser.add_argument("--skip_plot", action='store_true', help="Skip generation of plot file") 17 | parser.add_argument("--skip_csv", action='store_true', help="Skip generation of csv files") 18 | parser.add_argument("--use_sns", action='store_true', help="Use the SNS library to format plot") 19 | parser.add_argument("--csv_name", required=False, default="", type=str, help="Unique name for CSV files") 20 | parser.add_argument("--plot_only", action='store_true', help="Plot only using csv files") 21 | parser.add_argument("--csv_dir", required=False, type=str, help="Directory for csv files") 22 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import re 7 | from abstract_analysis import TensorBoardAnalysis 8 | 9 | 10 | class UniversalCheckpointingAnalysis(TensorBoardAnalysis): 11 | 12 | def __init__(self): 13 | self._name = "universal_checkpointing" 14 | 15 | def set_names(self, path_name): 16 | match = re.match(self.path_regex(), path_name) 17 | if not match: 18 | raise ValueError(f"Path ({path_name}) did not match regex ({self.path_regex()})") 19 | tp, pp, dp, sp = match.groups() 20 | 21 | self._label_name = f"Training Run: TP: {tp}, PP: {pp}, DP: {dp}" 22 | self._csv_name = f"uc_out_tp{tp}_pp{pp}_dp{dp}_sp{sp}" 23 | 24 | def get_label_name(self): 25 | return self._label_name 26 | 27 | def get_csv_filename(self): 28 | return self._csv_name 29 | 30 | def path_regex(self): 31 | return r'.*tp(\d+).*pp(\d+).*dp(\d+).*sp(\d+)' 32 | -------------------------------------------------------------------------------- /examples_deepspeed/universal_checkpointing/tb_analysis/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import os 7 | from uc_analysis import UniversalCheckpointingAnalysis 8 | 9 | 10 | def find_files_prefix(directory, file_prefix): 11 | """ 12 | Searches for files with a specific prefix in a directory using os.walk(). 13 | 14 | Args: 15 | directory (str): The path to the directory to search.
16 | file_prefix (str): The desired file prefix. 17 | 18 | Returns: 19 | list: A list of directories that contain a file with the matching prefix. 20 | """ 21 | matching_paths = [] 22 | for root, _, files in os.walk(directory): 23 | for filename in files: 24 | if root not in matching_paths and filename.lower().startswith(file_prefix.lower()): 25 | matching_paths.append(root) 26 | return matching_paths 27 | 28 | def find_files_suffix(directory, file_suffix): 29 | """ 30 | Searches for files with a specific suffix in a directory using os.walk(). 31 | 32 | Args: 33 | directory (str): The path to the directory to search. 34 | file_suffix (str): The desired file suffix. 35 | 36 | Returns: 37 | list: A list of names of the matching files. 38 | """ 39 | matching_paths = [] 40 | for root, _, files in os.walk(directory): 41 | for filename in files: 42 | if filename not in matching_paths and filename.lower().endswith(file_suffix.lower()): 43 | matching_paths.append(filename) 44 | return matching_paths 45 | 46 | def get_analyzer(analyzer_name): 47 | if analyzer_name == 'universal_checkpointing': 48 | return UniversalCheckpointingAnalysis() 49 | else: 50 | raise ValueError(f"Unsupported analyzer {analyzer_name}") 51 | -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/benchmark.png -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/bw_split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/bw_split.png -------------------------------------------------------------------------------- /examples_deepspeed/zero_bubble_pp/zbh1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/examples_deepspeed/zero_bubble_pp/zbh1.png -------------------------------------------------------------------------------- /images/Achieved_petaFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/images/Achieved_petaFLOPs.png -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_wandb_writer 13 | from .global_vars import get_adlr_autoresume 14 | from .global_vars import get_timers 15 | from .initialize import initialize_megatron 16 | 17 | from .utils import (print_rank_0, 18 | is_last_rank, 19 | print_rank_last, 20 | is_rank_0, 21 | is_aml) 22 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | from .model_parallel_config import ModelParallelConfig 6 | 7 | # Alias parallel_state as mpu, its legacy name 8 | mpu = parallel_state 9 | 10 | __all__ = [ 11 | "parallel_state", 12 | "tensor_parallel", 13 | "utils", 14 | "ModelParallelConfig" 15 | ] 16 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from typing import Tuple, Optional 5 | 6 | def _bias_dropout_add_func(x, bias, residual, prob, training): 7 | # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor 8 | # NOTE: Previously, the argument `bias` used to be passed as 9 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 10 | # transformer layer but broadcasting should automatically take care of that. 11 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 12 | # seem to be identical performance-wise (both just change the view). 
13 | if bias is not None: 14 | x = x + bias 15 | out = torch.nn.functional.dropout(x, p=prob, training=training) 16 | out = residual + out 17 | return out 18 | 19 | def get_bias_dropout_add(training, fused): 20 | 21 | def unfused_bias_dropout_add(x_with_bias, residual, prob): 22 | x, bias = x_with_bias # unpack 23 | return _bias_dropout_add_func(x, bias, residual, prob, training) 24 | 25 | @torch.jit.script 26 | def bias_dropout_add_fused_train( 27 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 28 | residual: torch.Tensor, 29 | prob: float 30 | ) -> torch.Tensor: 31 | x, bias = x_with_bias # unpack 32 | return _bias_dropout_add_func(x, bias, residual, prob, True) 33 | 34 | @torch.jit.script 35 | def bias_dropout_add_fused_inference( 36 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 37 | residual: torch.Tensor, 38 | prob: float 39 | ) -> torch.Tensor: 40 | x, bias = x_with_bias # unpack 41 | return _bias_dropout_add_func(x, bias, residual, prob, False) 42 | 43 | if fused: 44 | # jit scripting for a nn.module (with dropout) is not 45 | # triggering the fusion kernel. For now, we use two 46 | # different nn.functional routines to account for varying 47 | # dropout semantics during training and inference phases. 48 | if training: 49 | return bias_dropout_add_fused_train 50 | else: 51 | return bias_dropout_add_fused_inference 52 | else: 53 | return unfused_bias_dropout_add 54 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 2 6 | PATCH = 0 7 | PRE_RELEASE = '' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 19 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 20 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 21 | __description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models' 22 | __license__ = 'BSD-3' 23 | __keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 24 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | regex -------------------------------------------------------------------------------- /megatron/core/sequence_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_sequence_parallel_cross_entropy 2 | -------------------------------------------------------------------------------- 
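The constants in `fused_bias_gelu.py` above come from the tanh approximation of GELU (1/sqrt(2*pi) ≈ 0.3989423, sqrt(2/pi) ≈ 0.79788456). The following is a small self-contained numerical check, a sketch rather than repo code, comparing the tanh form against the exact erf-based GELU quoted in the file's comments:

```python
import torch

# Evaluate both GELU forms on a grid; float64 keeps rounding error negligible.
x = torch.linspace(-5.0, 5.0, steps=101, dtype=torch.float64)
tanh_gelu = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
exact_gelu = x * 0.5 * (1.0 + torch.erf(x * 0.70710678))

# The two forms agree to within roughly 1e-3 over this range, which is why
# the fused kernel can use the cheaper tanh approximation.
print((tanh_gelu - exact_gelu).abs().max())
```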
/megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | 4 | from .layers import ( 5 | ColumnParallelLinear, 6 | RowParallelLinear, 7 | VocabParallelEmbedding, 8 | set_tensor_model_parallel_attributes, 9 | set_defaults_if_not_set_tensor_model_parallel_attributes, 10 | copy_tensor_model_parallel_attributes, 11 | param_is_not_tensor_parallel_duplicate, 12 | linear_with_grad_accumulation_and_async_allreduce 13 | 14 | ) 15 | 16 | from .mappings import ( 17 | copy_to_tensor_model_parallel_region, 18 | gather_from_tensor_model_parallel_region, 19 | gather_from_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | scatter_to_sequence_parallel_region, 22 | ) 23 | 24 | from .random import ( 25 | checkpoint, 26 | get_cuda_rng_tracker, 27 | model_parallel_cuda_manual_seed, 28 | model_parallel_reconfigure_tp_seed, 29 | init_checkpointed_activations_memory_buffer, 30 | reset_checkpointed_activations_memory_buffer, 31 | ) 32 | 33 | from .utils import ( 34 | split_tensor_along_last_dim, 35 | split_tensor_into_1d_equal_chunks, 36 | gather_split_1d_tensor, 37 | ) 38 | 39 | __all__ = [ 40 | # cross_entropy.py 41 | "vocab_parallel_cross_entropy", 42 | # data.py 43 | "broadcast_data", 44 | #layers.py 45 | "ColumnParallelLinear", 46 | "RowParallelLinear", 47 | "VocabParallelEmbedding", 48 | "set_tensor_model_parallel_attributes", 49 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 50 | "copy_tensor_model_parallel_attributes", 51 | "param_is_not_tensor_parallel_duplicate", 52 | "linear_with_grad_accumulation_and_async_allreduce", 53 | # mappings.py 54 | "copy_to_tensor_model_parallel_region", 55 | "gather_from_tensor_model_parallel_region", 56 | "gather_from_sequence_parallel_region", 57 | # "reduce_from_tensor_model_parallel_region", 58 | "scatter_to_tensor_model_parallel_region", 59 | "scatter_to_sequence_parallel_region", 60 | # random.py 61 | "checkpoint", 62 | "get_cuda_rng_tracker", 63 | "model_parallel_cuda_manual_seed", 64 | "init_checkpointed_activations_memory_buffer", 65 | "reset_checkpointed_activations_memory_buffer", 66 | # utils.py 67 | "split_tensor_along_last_dim", 68 | "split_tensor_into_1d_equal_chunks", 69 | "gather_split_1d_tensor", 70 | ] 71 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/weight_grad_store.py: -------------------------------------------------------------------------------- 1 | import queue 2 | 3 | class WeightGradStore: 4 | 5 | cache = [] 6 | weight_grad_queue = queue.Queue() 7 | combine_bw = True 8 | 9 | @classmethod 10 | def set_combine_bw(cls, combine_bw): 11 | # For the following backward pass, combine W with B and skip next W. 12 | cls.combine_bw = combine_bw 13 | 14 | @classmethod 15 | def put(cls, total_input, grad_output, weight, func): 16 | if cls.combine_bw == True: 17 | func(total_input, grad_output, weight) 18 | return 19 | # Store the weight gradient computation of linear layers. 20 | cls.cache.append((total_input, grad_output, weight, func)) 21 | 22 | @classmethod 23 | def flush(cls): 24 | # Collect all stored computations during backward as a W. 25 | cls.weight_grad_queue.put(cls.cache) 26 | cls.cache = [] 27 | 28 | @classmethod 29 | def pop(cls): 30 | # Execute a single W. 
31 | assert cls.weight_grad_queue.qsize() > 0 32 | stored_grads = cls.weight_grad_queue.get() 33 | for total_input, grad_output, weight, func in stored_grads: 34 | func(total_input, grad_output, weight) -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .transformer_config import TransformerConfig 4 | from .core_attention import CoreAttention 5 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | -------------------------------------------------------------------------------- /megatron/core/transformer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for transformer layers.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args 8 | 9 | from deepspeed.runtime.zero import GatheredParameters 10 | 11 | def attention_mask_func(attention_scores, attention_mask): 12 | attention_scores.masked_fill_(attention_mask, -10000.0) 13 | return attention_scores 14 | 15 | 16 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False): 17 | """Simple linear layer with weight initialization.""" 18 | layer = torch.nn.Linear(rows, columns) 19 | if get_args().perform_initialization: 20 | with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init): 21 | init_method(layer.weight) 22 | with torch.no_grad(): 23 | with GatheredParameters(layer.bias, modifier_rank=0, enable=gather_params_on_init): 24 | layer.bias.zero_() 25 | return layer 26 | 27 | 28 | @torch.jit.script 29 | def gelu_impl(x): 30 | """OpenAI's gelu implementation.""" 31 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 32 | 33 | 34 | def openai_gelu(x): 35 | return gelu_impl(x) 36 | 37 | 38 | # This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter 39 | @torch.jit.script 40 | def erf_gelu(x): 41 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) 42 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py:
-------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes.
*/ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | #include <cuda_fp16.h> 4 | #include <torch/extension.h> 5 | #include <vector> 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd( 21 | torch::Tensor const& input, 22 | float scale_factor) { 23 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 24 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 25 | (input.scalar_type() == at::ScalarType::BFloat16), 26 | "Only fp16 and bf16 are supported"); 27 | 28 | return fwd_cuda(input, scale_factor); 29 | } 30 | 31 | torch::Tensor bwd( 32 | torch::Tensor const& output_grads, 33 | torch::Tensor const& softmax_results, 34 | float scale_factor) { 35 | 36 | AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); 37 | AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); 38 | 39 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 40 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 41 | "Only fp16 and bf16 are supported"); 42 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 43 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 44 | "Only fp16 and bf16 are supported"); 45 | 46 | return bwd_cuda(output_grads, softmax_results, scale_factor); 47 | } 48 | 49 | } // end namespace scaled_softmax 50 | } // end namespace fused_softmax 51 | } // end namespace multihead_attn 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("forward", 55 | &multihead_attn::fused_softmax::scaled_softmax::fwd, 56 | "Self Multihead Attention scaled, softmax -- Forward."); 57 | m.def("backward", 58 | &multihead_attn::fused_softmax::scaled_softmax::bwd, 59 | "Self Multihead Attention scaled, softmax -- Backward."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*/ 2 | 3 | #include <cuda_fp16.h> 4 | #include <torch/extension.h> 5 | #include <vector> 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_upper_triang_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 21 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 22 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 23 | (input.scalar_type() == at::ScalarType::BFloat16), 24 | "Only fp16 and bf16 are supported"); 25 | 26 | return fwd_cuda(input, scale_factor); 27 | } 28 | 29 | torch::Tensor bwd( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor) { 33 | 34 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 35 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 36 | 37 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 38 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 39 | "Only fp16 and bf16 are supported"); 40 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 41 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 42 | "Only fp16 and bf16 are supported"); 43 | 44 | return bwd_cuda(output_grads, softmax_results, scale_factor); 45 | } 46 | 47 | } // end namespace scaled_upper_triang_masked_softmax 48 | } // end namespace fused_softmax 49 | } // end namespace multihead_attn 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", 53 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 54 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 55 | m.def("backward", 56 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 57 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 58 | } 59 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import torch 4 | from deepspeed.accelerator.real_accelerator import get_accelerator 5 | 6 | if get_accelerator().device_name() == 'xpu': 7 | import intel_extension_for_pytorch 8 | if get_accelerator().device_name() == 'cuda': 9 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 10 | from apex.normalization import MixedFusedRMSNorm as RMSNorm 11 | else: 12 | if hasattr(torch.xpu, "IpexRmsNorm"): 13 | from .fused_rmsnorm import RMSNorm 14 | else: 15 | from .rmsnorm import RMSNorm 16 | from torch.nn import LayerNorm 17 | 18 | from .distributed import DistributedDataParallel 19 | from .bert_model import BertModel 20 | from .gpt_model import GPTModel, GPTModelPipe 21 | from .t5_model import T5Model 22 | from .language_model import get_language_model 23 | from .module import Float16Module 24 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/model/fused_rmsnorm.py: -------------------------------------------------------------------------------- 1 | from megatron import get_args 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | from torch.nn import init 6 | import intel_extension_for_pytorch as ipex # noqa 7 | 8 | # Taken from facebookresearch/llama 9 | class RMSNorm(torch.nn.Module): 10 | def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): 11 | super().__init__() 12 | self.eps = eps 13 | self.weight = Parameter(torch.ones(dim, 14 | dtype=get_args().params_dtype)) 15 | self.sequence_parallel = sequence_parallel 16 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 17 | 18 | def forward(self, x): 19 | output = torch.xpu.IpexRmsNorm(x, self.weight.shape, self.weight, self.eps) 20 | return output 21 | -------------------------------------------------------------------------------- /megatron/model/rmsnorm.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
2 | 3 | from deepspeed.accelerator import get_accelerator 4 | from megatron import get_args 5 | 6 | import torch 7 | from torch.nn import init 8 | from torch.nn.parameter import Parameter 9 | 10 | # Taken from facebookresearch/llama 11 | class RMSNorm(torch.nn.Module): 12 | def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): 13 | super().__init__() 14 | self.eps = eps 15 | init_device = None 16 | if get_accelerator().device_name() == 'hpu': 17 | init_device = get_accelerator().current_device_name() 18 | self.weight = Parameter(torch.empty(dim, 19 | device=init_device, 20 | dtype=get_args().params_dtype)) 21 | init.ones_(self.weight) 22 | self.sequence_parallel = sequence_parallel 23 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 24 | 25 | def _norm(self, x): 26 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 27 | 28 | def forward(self, x): 29 | output = self._norm(x.float()).type_as(x) 30 | return output * self.weight 31 | -------------------------------------------------------------------------------- /megatron/model/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/model/vision/__init__.py -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > input_w: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would be more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | from deepspeed.accelerator import get_accelerator 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducibility.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % get_accelerator().device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | get_accelerator().set_device(device) 50 | 51 | # Call the init process. 52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/profiler.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
2 | 3 | import torch 4 | 5 | on_step_begin = [] 6 | on_step_end = [] 7 | 8 | def trigger(phase): 9 | for f in phase: f() 10 | 11 | def setup_profiler(args, device): 12 | if args.profile is None: 13 | return 14 | 15 | start_step, end_step = map(int, args.profile_steps.split(',')) 16 | active_steps = end_step - start_step + 1 17 | cur_step = 0 18 | 19 | def on_step_begin_fn(): 20 | nonlocal cur_step 21 | cur_step = cur_step + 1 22 | on_step_begin.append(on_step_begin_fn) 23 | 24 | def when(cond, clbk): 25 | def fn(): 26 | if cond(): 27 | clbk() 28 | return fn 29 | 30 | def is_start_step(): 31 | return cur_step == start_step 32 | 33 | def is_end_step(): 34 | return cur_step == end_step 35 | 36 | def is_capture_step(): 37 | return cur_step >= start_step and cur_step <= end_step 38 | 39 | if args.profile.startswith('pt') and ( 40 | args.profile_ranks is None or torch.distributed.get_rank() in args.profile_ranks 41 | ): 42 | schedule = torch.profiler.schedule(wait=0, warmup=0, active=active_steps, repeat=1) 43 | activities = [torch.profiler.ProfilerActivity.CPU] 44 | activities.extend([torch.profiler.ProfilerActivity.HPU] if device.startswith("hpu") else []) 45 | activities.extend([torch.profiler.ProfilerActivity.CUDA] if device.startswith("cuda") else []) 46 | full = args.profile == 'pt-full' 47 | 48 | profiler = torch.profiler.profile( 49 | schedule=schedule, 50 | activities=activities, 51 | on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir, use_gzip=True), 52 | with_stack=full) 53 | 54 | on_step_begin.append(when(is_start_step, profiler.start)) 55 | on_step_end.append(when(is_capture_step, profiler.step)) 56 | on_step_end.append(when(is_end_step, profiler.stop)) 57 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified tasks in the evaluation harness. 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 4 | # under the license https://huggingface.co/spaces/bigscience/license 5 | 6 | # this script converts results.json: 7 | # 8 | # "results": { 9 | # "arc_challenge": { 10 | # "acc": 0.24232081911262798, 11 | # "acc_stderr": 0.01252159329580012, 12 | # "acc_norm": 0.2764505119453925, 13 | # "acc_norm_stderr": 0.013069662474252425 14 | # }, 15 | # 16 | # into a format expected by a spreadsheet, which is: 17 | # 18 | # task metric value err 19 | # arc_challenge acc xxx yyy 20 | # arc_challenge acc_norm xxx yyy 21 | # arc_challenge f1 xxx yyy 22 | # 23 | # usage: 24 | # report-to-csv.py results.json 25 | 26 | 27 | import sys 28 | import json 29 | import io 30 | import csv 31 | 32 | results_file = sys.argv[1] 33 | 34 | csv_file = results_file.replace("json", "csv") 35 | 36 | print(f"Converting {results_file} to {csv_file}") 37 | 38 | with io.open(results_file, 'r', encoding='utf-8') as f: 39 | results = json.load(f) 40 | 41 | with io.open(csv_file, 'w', encoding='utf-8') as f: 42 | 43 | writer = csv.writer(f) 44 | writer.writerow(["task", "metric", "value", "err", "version"]) 45 | 46 | versions = results["versions"] 47 | 48 | for k,v in sorted(results["results"].items()): 49 | if k not in versions: 50 | versions[k] = -1 51 | 52 | if "acc" in v: 53 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 54 | if "acc_norm" in v: 55 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 56 | if "f1" in v: 57 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 58 | # if "ppl" in v: 59 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 60 | # if "em" in v: 61 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 62 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 
21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. 
Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? 
") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import pytest 4 | 5 | from megatron.core import parallel_state 6 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | 10 | # initialize model parallel for tests 11 | parallel_state.set_tensor_model_parallel_world_size(1) 12 | parallel_state.set_tensor_model_parallel_rank(0) 13 | parallel_state._set_global_memory_buffer() 14 | parallel_state.set_pipeline_model_parallel_rank(0) 15 | parallel_state.set_pipeline_model_parallel_world_size(1) 16 | 17 | model_parallel_cuda_manual_seed(123) 18 | 19 | 20 | @pytest.fixture 21 | def transformer_config(): 22 | return TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 23 | -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/check_slurm_job_completion.py: -------------------------------------------------------------------------------- 1 | """Check if a given slurm job id completed successfully 2 | Usage: 3 | python3 check_slurm_job_completion.py 4 | """ 5 | 6 | 
import sys 7 | import subprocess 8 | 9 | 10 | cmd = f"sacct -j {sys.argv[1]}" 11 | result = subprocess.check_output(cmd, shell=True).decode().split() 12 | assert len(result) > 20, "JOB state not available." 13 | 14 | status = result[19] 15 | exit_code = result[20] 16 | 17 | assert status == "COMPLETED", f"Job {sys.argv[1]} not completed." 18 | assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully." 19 | 20 | -------------------------------------------------------------------------------- /tests/functional_tests/shell_test_utils/jobwait.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | JOBID=$1 4 | echo "Job id : $JOBID" 5 | 6 | if [[ -z "$JOBID" ]]; then 7 | exit 1 8 | fi 9 | 10 | sleep 10s 11 | 12 | while true; do 13 | export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1` 14 | case "${STATE}" in 15 | PENDING|RUNNING|REQUEUED) 16 | echo "Job is still in $STATE" 17 | sleep 15s 18 | ;; 19 | *) 20 | sleep 30s 21 | echo "Exiting with SLURM job status '${STATE}'" 22 | exit 0 23 | ;; 24 | esac 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 
1.1249785294117647} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -o xtrace 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | TP_SIZE=$4 8 | PP_SIZE=$5 9 | NNODES=$6 10 | MAX_STEPS=$7 11 | VP_SIZE=$8 12 | GPUS_PER_NODE=8 13 | # Change for multinode config 14 | MASTER_ADDR=localhost 15 | MASTER_PORT=6000 16 | NODE_RANK=0 17 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Runs the "345M" parameter model 22 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 23 | 24 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 25 | pretrain_bert.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --log-params-norm \ 30 | --log-num-zeros-in-grad \ 31 | --log-validation-ppl-to-tensorboard \ 32 | --log-timers-to-tensorboard \ 33 | --tensorboard-dir ${TENSORBOARD_DIR} \ 34 | --micro-batch-size 4 \ 35 | --global-batch-size 128 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --train-iters $MAX_STEPS \ 39 | --timing-log-level 2 \ 40 | --lr-decay-iters 990000 \ 41 | --save $CHECKPOINT_PATH \ 42 | --load $CHECKPOINT_PATH \ 43 | --data-path $DATA_PATH \ 44 | --vocab-file /workspace/data/bert_data/vocab.txt \ 45 | --data-impl mmap \ 46 | --split 949,50,1 \ 47 | --distributed-backend nccl \ 48 | --lr 0.0001 \ 49 | --min-lr 0.00001 \ 50 | --lr-warmup-fraction 0.01 \ 51 | --log-interval 1 \ 52 | --save-interval 10000 \ 53 | --eval-interval 1000 \ 54 | --eval-iters 10 \ 55 | --tensor-model-parallel-size $TP_SIZE \ 56 | --pipeline-model-parallel-size $PP_SIZE \ 57 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 58 | --no-gradient-accumulation-fusion \ 59 | --fp16 -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out 
--container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 14 | ls 15 | cd /workspace/megatron-lm 16 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr 5 | #SBATCH --job-name=adlr-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | IMAGE=gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel 13 | 14 | if [[ $USE_TE -eq 1 ]]; then 15 | echo "Using container nvcr.io/nvidia/pytorch:23.04-py3 for running with TE ..." 
16 | IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 17 | fi 18 | 19 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 20 | ls 21 | cd /workspace/megatron-lm 22 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS" 23 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/tensor_parallel/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/tensor_parallel/__int__.py -------------------------------------------------------------------------------- /tests/test_megatron.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import re 4 | import subprocess 5 | 6 | 7 | @pytest.fixture(params=[1]) 8 | def moe_num_experts(request): 9 | return str(request.param) 10 | 11 | 12 | @pytest.fixture(params=[1]) 13 | def mp_size(request): 14 | return str(request.param) 15 | 16 | 17 | @pytest.fixture 18 | def params(moe_num_experts, mp_size): 19 | base_dir = os.getenv("MEGATRON_CKPT_DIR") 20 | assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" 21 | 22 | vocab_file = os.path.join(base_dir, "gpt2-vocab.json") 23 | merge_file = os.path.join(base_dir, "gpt2-merges.txt") 24 | ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") 25 | 26 | return [ 27 | "--micro-batch-size", "1", 28 | "--num-layers", "24", 29 | "--hidden-size", "1024", 30 | "--num-attention-heads", "16", 31 | "--max-position-embeddings", "1024", 32 | "--vocab-file", vocab_file, 33 | "--merge-file", merge_file, 34 | "--load", ckpt_path, 35 | "--seq-length", "1024", 36 | "--out-seq-length", "1024", 37 | "--tensor-model-parallel-size", mp_size, 38 | "--tokenizer-type", "GPT2BPETokenizer", 39 | "--num-experts", moe_num_experts, 40 | "--mlp-type", "standard", 41 | "--num-samples", "0", 42 | "--fp16", 43 | ] 44 | 45 | 46 | def test_moe_megatron(params, mp_size): 47 | output_re = r"===START OUTPUT===([\S\s]*)===END OUTPUT===" 48 | 49 | # Run the baseline 50 | baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params 51 | result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) 52 | baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 53 | 54 | # Run with DeepSpeed 55 | deepspeed_cmd = baseline_cmd + ["--ds-inference"] 56 | result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) 57 | deepspeed_output 
= re.search(output_re, result.stdout.decode("utf-8")).group(1) 58 | 59 | assert ( 60 | baseline_output == deepspeed_output 61 | ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" 62 | -------------------------------------------------------------------------------- /tests/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/transformer/__init__.py -------------------------------------------------------------------------------- /tests/transformer/test_core_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.core_attention import CoreAttention 9 | 10 | 11 | @pytest.fixture 12 | def core_attention(transformer_config): 13 | return CoreAttention(transformer_config) 14 | 15 | 16 | class TestCoreAttention: 17 | def test_constructor(self, core_attention): 18 | assert isinstance(core_attention, CoreAttention) 19 | assert core_attention.layer_number == 1 20 | 21 | num_weights = sum([p.numel() for p in core_attention.parameters()]) 22 | assert num_weights == 0 23 | 24 | def test_cpu_forward(self, core_attention): 25 | # we can't currently do this because the global memory buffer is on GPU 26 | pass 27 | 28 | def test_gpu_forward(self, core_attention): 29 | 30 | # destroy_global_memory_buffer() 31 | # _set_global_memory_buffer() 32 | # model_parallel_cuda_manual_seed(123) 33 | 34 | core_attention.cuda() 35 | config = core_attention.config 36 | sequence_length = 32 37 | micro_batch_size = 2 38 | # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] 39 | query_layer = torch.ones( 40 | ( 41 | sequence_length, 42 | micro_batch_size, 43 | config.num_attention_heads, 44 | config.hidden_size // config.num_attention_heads, 45 | ) 46 | ).cuda() 47 | 48 | key_layer = torch.ones_like(query_layer).cuda() 49 | 50 | value_layer = torch.ones_like(query_layer).cuda() 51 | 52 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 53 | 54 | context_layer = core_attention( 55 | query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask 56 | ) 57 | 58 | assert context_layer.shape[0] == sequence_length 59 | assert context_layer.shape[1] == micro_batch_size 60 | assert context_layer.shape[2] == config.hidden_size 61 | assert context_layer.device.type == 'cuda' 62 | assert context_layer.dtype == torch.float32 63 | 64 | -------------------------------------------------------------------------------- /tests/transformer/test_parallel_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
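# NOTE (illustrative, not part of the original file): in test_core_attention.py
# above, CoreAttention consumes query/key/value tensors of shape
# [sequence_length, micro_batch_size, num_attention_heads,
# hidden_size // num_attention_heads] and returns a context tensor of shape
# [sequence_length, micro_batch_size, hidden_size], which is exactly what the
# gpu-forward assertions check.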
3 | 4 | import pytest 5 | 6 | import torch 7 | import types 8 | 9 | from megatron.core.transformer.parallel_mlp import ParallelMLP 10 | from megatron.global_vars import set_args 11 | 12 | from deepspeed.accelerator import get_accelerator 13 | device_name = get_accelerator().device_name() 14 | 15 | @pytest.fixture 16 | def mlp(transformer_config): 17 | mlp_args = types.SimpleNamespace( 18 | swiglu=False, 19 | openai_gelu=True, 20 | onnx_safe=False, 21 | bias_gelu_fusion=False, 22 | transformer_impl="", 23 | cache_fp8_weight=False, 24 | fp8_interval=False, 25 | cache_fp8_weight_fwd=False 26 | ) 27 | set_args(mlp_args) 28 | return ParallelMLP(transformer_config) 29 | 30 | 31 | class TestParallelMLP: 32 | def test_constructor(self, mlp): 33 | assert isinstance(mlp, ParallelMLP) 34 | 35 | num_weights = sum([p.numel() for p in mlp.parameters()]) 36 | assert num_weights == 1212 37 | 38 | def test_cpu_forward(self, mlp, transformer_config): 39 | # [sequence length, micro batch size, hidden size] 40 | hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) 41 | output, output_bias = mlp(hidden_states) 42 | assert output.shape[0] == 32 43 | assert output.shape[1] == 2 44 | assert output.shape[2] == transformer_config.hidden_size 45 | assert output_bias == None 46 | assert output.dtype == torch.float32 47 | 48 | @pytest.mark.skipif(not get_accelerator().is_available(), reason="accelerator not available") 49 | def test_accelerator_forward(self, mlp, transformer_config): 50 | mlp.to(device_name) 51 | # [sequence length, batch size, hidden size] 52 | hidden_states = torch.ones((32, 2, transformer_config.hidden_size)) 53 | hidden_states = hidden_states.to(device_name) 54 | output, output_bias = mlp(hidden_states) 55 | assert output.shape[0] == 32 56 | assert output.shape[1] == 2 57 | assert output.shape[2] == transformer_config.hidden_size 58 | assert output_bias == None 59 | assert output.dtype == torch.float32 60 | assert output.device.type == device_name 61 | 62 | -------------------------------------------------------------------------------- /tests/transformer/test_parallel_transformer_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
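# NOTE (illustrative, not part of the original file): the transformer_config
# fixture used below is defined in tests/conftest.py earlier in this listing
# (num_layers=2, hidden_size=12, num_attention_heads=4,
# use_cpu_initialization=True).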
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | from megatron.core.transformer.parallel_transformer_layer import ParallelTransformerLayer 10 | 11 | 12 | @pytest.fixture 13 | def parallel_transformer_layer(transformer_config): 14 | return ParallelTransformerLayer(transformer_config) 15 | 16 | 17 | class TestParallelTransformerLayer: 18 | def test_constructor(self, parallel_transformer_layer): 19 | assert isinstance(parallel_transformer_layer, ParallelTransformerLayer) 20 | assert parallel_transformer_layer.layer_number == 1 21 | 22 | num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) 23 | assert num_weights == 1884 24 | 25 | def test_gpu_forward(self, parallel_transformer_layer): 26 | config: TransformerConfig = parallel_transformer_layer.config 27 | sequence_length = 32 28 | micro_batch_size = 2 29 | parallel_transformer_layer.cuda() 30 | 31 | # [sequence length, batch size, hidden size] 32 | hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) 33 | hidden_states = hidden_states.cuda() 34 | 35 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 36 | 37 | hidden_states = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) 38 | assert hidden_states.shape[0] == sequence_length 39 | assert hidden_states.shape[1] == micro_batch_size 40 | assert hidden_states.shape[2] == config.hidden_size 41 | -------------------------------------------------------------------------------- /tests/transformer/test_transformer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
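# NOTE (illustrative, not part of the original file): the expected values below
# follow from the conftest fixture (hidden_size=12, num_attention_heads=4):
# ffn_hidden_size defaults to 4 * hidden_size = 48, and kv_channels defaults to
# hidden_size // num_attention_heads = 12 // 4 = 3.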
2 | 3 | 4 | class TestTransformerConfig: 5 | def test_transformer_config(self, transformer_config): 6 | 7 | assert transformer_config.hidden_size == 12 8 | assert transformer_config.ffn_hidden_size == 48 9 | assert transformer_config.num_attention_heads == 4 10 | assert transformer_config.kv_channels == 3 11 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER 4 | from megatron.core.tensor_parallel.random import checkpoint 5 | from tests.unit_tests.test_utilities import Utils 6 | import pytest 7 | import torch 8 | 9 | def test_cuda_rng_states_tracker(): 10 | rng_tracker = CudaRNGStatesTracker() 11 | rng_tracker.set_states({"state1":1234}) 12 | assert(rng_tracker.get_states()["state1"] == 1234) 13 | rng_tracker.reset() 14 | assert(rng_tracker.get_states() == {}) 15 | seed = 1111 16 | rng_tracker.add("state2",seed) 17 | with 
pytest.raises(Exception): 18 | assert(rng_tracker.add("state3",seed)) 19 | with pytest.raises(Exception): 20 | assert(rng_tracker.add("state2",111)) 21 | assert(rng_tracker.get_states()['state2'] is not None) 22 | with pytest.raises(Exception): 23 | assert() 24 | 25 | rng_tracker.fork("state2") 26 | torch.cuda.manual_seed(seed) 27 | rng_state = torch.cuda.get_rng_state() 28 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 29 | 30 | def test_model_parallel_cuda_manual_seed(): 31 | Utils.initialize_model_parallel(4,2) 32 | model_parallel_cuda_manual_seed(0) 33 | assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import megatron.core.tensor_parallel.utils as util 3 | import megatron.core.parallel_state as ps 4 | from tests.unit_tests.test_utilities import Utils 5 | 6 | rank = Utils.rank 7 | 8 | def test_split_tensor_along_last_dim(): 9 | input_tensor = torch.rand((3,4)) 10 | torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) 11 | torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) 12 | 13 | def test_split_tensor_into_1d_equal_chunks(): 14 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 15 | input_tensor = torch.rand((3,4)) 16 | output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) 17 | if rank % 2 == 0 : 18 | start = 0 19 | end = int(input_tensor.numel()/2) 20 | else : 21 | start = int(input_tensor.numel()/2) 22 | end = input_tensor.numel() 23 | 24 | assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) 25 | Utils.destroy_model_parallel() 26 | 27 | def test_gather_split_1d_tensor(): 28 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 29 | input_tensor = torch.ones((2,4)).cuda() * rank 30 | actual_output_tensor = util.gather_split_1d_tensor(input_tensor) 31 | if rank %2 == 0: 32 | expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) 33 | else : 34 | expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) 35 | assert(torch.equal(actual_output_tensor, expected_output_tensor)) 36 | Utils.destroy_model_parallel() 37 | 38 | def test_vocab(): 39 | global_vocab_size = 1600 40 | per_partition_vocab_size = 1600 / Utils.world_size 41 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) 42 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) 43 | 
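# NOTE (illustrative, not part of the original file): a concrete instance of
# the partitioning checked above: with global_vocab_size=1600 and world_size=8,
# per_partition_vocab_size is 200, so rank r owns the half-open id range
# [200 * r, 200 * (r + 1)); e.g. rank 3 covers [600, 800).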
-------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 2 | 3 | import os 4 | import torch 5 | import megatron.core.parallel_state as ps 6 | 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | class Utils: 10 | 11 | world_size = int(os.getenv("WORLD_SIZE", '1')) 12 | rank = int(os.getenv('LOCAL_RANK', '0')) 13 | 14 | @staticmethod 15 | def initialize_distributed(): 16 | print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') 17 | get_accelerator().set_device(Utils.rank % get_accelerator().device_count()) 18 | init_method = 'tcp://' 19 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 20 | master_port = os.getenv('MASTER_PORT', '6000') 21 | init_method += master_ip + ':' + master_port 22 | torch.distributed.init_process_group(backend=get_accelerator().communication_backend_name(), world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) 23 | 24 | @staticmethod 25 | def destroy_model_parallel(): 26 | ps.destroy_model_parallel() 27 | torch.distributed.barrier() 28 | 29 | @staticmethod 30 | def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, sequence_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): 31 | ps.destroy_model_parallel() 32 | if not torch.distributed.is_initialized(): 33 | Utils.initialize_distributed() 34 | ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, sequence_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import megatron.core.utils as util 4 | import numpy as np 5 | 6 | def test_divide_properly(): 7 | assert util.divide(4,2) == 2 8 | 9 | def test_divide_improperly(): 10 | with pytest.raises(AssertionError): 11 | util.divide(4,5) 12 | 13 | def test_global_memory_buffer(): 14 | global_memory_buffer = util.GlobalMemoryBuffer() 15 | obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") 16 | expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) 17 | assert torch.equal(obtained_tensor, expected_tensor) 18 | 19 | def test_make_viewless_tensor(): 20 | inp = torch.rand((3,4)) 21 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) 22 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) 23 | 24 | def test_safely_set_viewless_tensor_data(): 25 | tensor = torch.zeros((3,4)) 26 | new_data_tensor = torch.tensor(np.random.rand(3,4)) 27 | util.safely_set_viewless_tensor_data(tensor, new_data_tensor) 28 | assert(torch.equal(tensor, new_data_tensor)) 29 | 30 | def test_assert_viewless_tensor(): 31 | tensor = torch.rand((3,4)) 32 | assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) 33 | input_tensor_list=[tensor,tensor,tensor] 34 | output_tensor_list = 
util.assert_viewless_tensor(input_tensor_list) 35 | for inp,out in zip(input_tensor_list, output_tensor_list): 36 | assert(torch.equal(inp,out)) 37 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tools/__init__.py -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} <checkpoint_file>') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file) 34 | dump_data(sd) 35 | 36 | quit() 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 
'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command, shell=True) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
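# NOTE (illustrative, not part of the original file): this utility concatenates
# every *.json file found under --json_path into --output_file; each input row
# is parsed with json.loads() purely as a validity check before being written
# through verbatim. Example invocation (paths are placeholders):
#   python tools/openwebtext/merge_jsons.py --json_path ./shards --output_file merged.json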
2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/retro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/Megatron-DeepSpeed/3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26/tools/retro/__init__.py -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
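# NOTE (illustrative, not part of the original file):
# tools/openwebtext/remove_group_duplicates.py above takes three positional
# arguments (names are placeholders):
#   python remove_group_duplicates.py <url_file> <data_file> <output_file>
# <url_file> lists groups of duplicate URLs; every document whose 'url' appears
# after the first entry of its group is dropped from <data_file>.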
2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/examples/get_dataset_configs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Small English Wikipedia dataset (~2M chunks). 4 | get_wiki_tiny_config() { 5 | RETRO_INDEX_STR="IVF4096_HNSW4,Flat" 6 | RETRO_NCHUNKS_SAMPLED=2281307 7 | RETRO_GPT_TRAIN_SAMPLES=31250 8 | LR_DECAY_SAMPLES=2 9 | LR_WARMUP_SAMPLES=1 10 | RETRO_GPT_EVAL_INTERVAL=2000 11 | RETRO_GPT_EVAL_ITERS=100 12 | RETRO_EF_SEARCH=4 13 | RETRO_NPROBE=64 14 | DATALOADER_TYPE=cyclic 15 | } 16 | 17 | # English Wikipedia dataset (~67M chunks). 18 | get_wiki_config() { 19 | RETRO_INDEX_STR="IVF262144_HNSW32,Flat" 20 | RETRO_NCHUNKS_SAMPLED=66625331 21 | RETRO_GPT_TRAIN_SAMPLES=2037248 22 | LR_DECAY_SAMPLES=2 23 | LR_WARMUP_SAMPLES=1 24 | RETRO_GPT_EVAL_INTERVAL=2000 25 | RETRO_GPT_EVAL_ITERS=100 26 | RETRO_EF_SEARCH=16 27 | RETRO_NPROBE=4096 28 | DATALOADER_TYPE=cyclic 29 | } 30 | 31 | # Full corpus (~5B chunks). 32 | get_corpus_config() { 33 | RETRO_INDEX_STR="OPQ64_128,IVF4194304_HNSW32,PQ64" 34 | RETRO_NCHUNKS_SAMPLED=300000000 35 | RETRO_GPT_TRAIN_SAMPLES=192000000 36 | LR_DECAY_SAMPLES=166400000 37 | LR_WARMUP_SAMPLES=162761 38 | RETRO_GPT_EVAL_INTERVAL=2000 39 | RETRO_GPT_EVAL_ITERS=50 40 | RETRO_EF_SEARCH=32 41 | RETRO_NPROBE=4096 42 | DATALOADER_TYPE=single 43 | } 44 | -------------------------------------------------------------------------------- /tools/retro/examples/preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | unset NCCL_DEBUG 5 | 6 | NPROCS=8 # NPROCS must be <= number of GPUs. 7 | 8 | set_current_dir() { 9 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 10 | } 11 | 12 | ################ Dataset configs. ################ 13 | # This script contains methods to customize arguments to specific dataset 14 | # types. Customize this script as needed for your datasets. 15 | set_current_dir 16 | . $DIR/get_dataset_configs.sh 17 | 18 | ################ Environment variables. ################ 19 | # *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for 20 | # a description of the required environment variables. These variables can be 21 | # set however a user would like. In our setup, we use another bash script 22 | # (location defined by $RETRO_ENV_VARS) that sets all the environment variables 23 | # at once. 24 | . $RETRO_ENV_VARS 25 | 26 | ######## Environment vars. ######## 27 | set_current_dir 28 | . ${DIR}/get_preprocess_cmd.sh 29 | 30 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 31 | echo "DIR = '$DIR'." 32 | echo "RETRO_PREPROCESS_CMD = '$RETRO_PREPROCESS_CMD'." 33 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 34 | 35 | ######## Command. 
######## 36 | FULL_CMD="\ 37 | pwd && cd ${REPO_DIR} && pwd && \ 38 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 39 | python -m torch.distributed.run \ 40 | --nproc_per_node ${NPROCS} \ 41 | --nnodes 1 \ 42 | --node_rank ${NODE_RANK} \ 43 | --master_addr ${MASTER_ADDR} \ 44 | --master_port 6000 \ 45 | $RETRO_PREPROCESS_CMD \ 46 | " 47 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 48 | echo "FULL_CMD = '$FULL_CMD'." 49 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 50 | eval $FULL_CMD 51 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 8 | 9 | Index type generally read from argument '--retro-index-ty'. 10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 
--------------------------------------------------------------------------------
/tools/retro/index/index.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import abc
import numpy as np
import os
import torch

from megatron import get_retro_args
from tools.retro.external_libs import faiss

from .utils import get_index_dir


class Index(abc.ABC):
    '''Abstract base class for indexes.

    *Note*: While currently only Faiss-based classes are implemented, in the
    future, this class will be extended with other types of indexes that have
    different performance-accuracy trade-offs.

    The primary methods to override are:
    - train() : Train index on the sampled training chunks.
    - add() : Add all training chunks to index.
    '''

    @classmethod
    def c_verbose(cls, index, v):
        '''Make index object verbose.'''
        assert isinstance(v, bool)
        faiss.ParameterSpace().set_index_parameter(index, "verbose", v)

    def get_empty_index_path(self):
        args = get_retro_args()
        return os.path.join(
            get_index_dir(),
            "empty_%.3f.faissindex" % args.retro_index_train_load_fraction,
        )

    def get_empty_index(self):
        return faiss.read_index(self.get_empty_index_path())

    def get_added_index_path(self):
        args = get_retro_args()
        return os.path.join(
            get_index_dir(),
            "added_%.3f_%.3f.faissindex" % (
                args.retro_index_train_load_fraction,
                args.retro_index_add_load_fraction,
            ),
        )

    def get_added_index(self):
        return faiss.read_index(self.get_added_index_path())

    @abc.abstractmethod
    def train(self, *args):
        pass

    @abc.abstractmethod
    def add(self, *args):
        pass

    def embed_text_dataset_block(self, embedder, text_dataset, _range):
        '''Embed a range of a text dataset.'''
        sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range))
        return embedder.embed_text_dataset(sub_dataset)
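

# Minimal subclass sketch (illustrative only, not from the original file): a
# concrete index implements train() and add(); the path naming and the
# reading back of the Faiss index are inherited from Index.
#
#   class FlatIndex(Index):
#       def train(self, *args):
#           pass  # a flat index needs no training
#       def add(self, *args):
#           index = self.get_empty_index()
#           # ...add embedded chunks, then write to get_added_index_path()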
--------------------------------------------------------------------------------
/tools/retro/index/indexes/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .faiss_base import FaissBaseIndex
from .faiss_par_add import FaissParallelAddIndex
--------------------------------------------------------------------------------
/tools/retro/index/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import concurrent
import gc
import glob
import numpy as np
import os
import psutil
import time
import torch
from tqdm import tqdm

from megatron import get_retro_args, print_rank_0
from tools.retro.db.utils import get_indexed_dataset_infos
from tools.retro.external_libs import h5py


def get_index_dir():
    """Create (if needed) and return the sub-directory for this index."""

    args = get_retro_args()

    # Directory path.
    index_dir_path = os.path.join(
        args.retro_workdir,
        "index",
        args.retro_index_type,
        args.retro_index_str,
    )

    # Make directory.
    os.makedirs(index_dir_path, exist_ok=True)

    return index_dir_path


def num_samples_to_block_ranges(num_samples):
    '''Split a range (length num_samples) into a sequence of block ranges
    of size block_size.'''
    args = get_retro_args()
    block_size = args.retro_block_size
    start_idxs = list(range(0, num_samples, block_size))
    end_idxs = [min(num_samples, s + block_size) for s in start_idxs]
    ranges = list(zip(start_idxs, end_idxs))
    return ranges


def get_training_data_root_dir():
    args = get_retro_args()
    return os.path.join(args.retro_workdir, "index", "train_emb")


def get_training_data_block_dir():
    return os.path.join(get_training_data_root_dir(), "blocks")


def get_training_data_block_paths():
    return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5"))


def get_training_data_merged_path():
    args = get_retro_args()
    return os.path.join(get_training_data_root_dir(),
                        "train_%.3f.bin" % args.retro_index_train_load_fraction)


def get_added_codes_dir():
    return os.path.join(get_index_dir(), "add_codes")


def get_added_code_paths():
    return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5"))
--------------------------------------------------------------------------------
/tools/retro/query/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

from .query import query_pretraining_neighbors
--------------------------------------------------------------------------------
/tools/retro/query/utils.py:
--------------------------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import hashlib
import os

from megatron import get_retro_args


def get_query_workdir():
    args = get_retro_args()
    return os.path.join(args.retro_workdir, "query")


def get_neighbor_dirname(key, dataset):
    hashes = ",".join([d.desc_hash for d in dataset.datasets])
    # 'hash_str' avoids shadowing the built-in hash().
    hash_str = hashlib.md5(hashes.encode()).hexdigest()
    return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{hash_str}"))
--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import sys
import json
import requests


if __name__ == "__main__":
    url = sys.argv[1]
    url = 'http://' + url + '/api'
    headers = {'Content-Type': 'application/json'}

    while True:
        sentence = input("Enter prompt: ")
        # Parse as a plain integer; avoid eval() on raw user input.
        tokens_to_generate = int(input("Enter number of tokens to generate: "))

        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
        response = requests.put(url, data=json.dumps(data), headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.json()['message']}")
        else:
            print("Megatron Response: ")
            print(response.json()['text'][0])
--------------------------------------------------------------------------------
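
A usage sketch for the client above (the address and port are assumptions;
point it at wherever the Megatron text-generation server is listening):

    $ python tools/text_generation_cli.py localhost:5000
    Enter prompt: The quick brown fox
    Enter number of tokens to generate: 32
    Megatron Response:
    ...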