├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── dataset ├── README.md ├── download_books.sh ├── download_ckpt.sh └── download_vocab.sh ├── examples ├── MoE │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ ├── ds_pretrain_gpt_1.3B_dense.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ ├── ds_pretrain_gpt_125M_MoE64.sh │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ ├── ds_pretrain_gpt_350M_MoE128.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ ├── ds_pretrain_gpt_350M_dense.sh │ ├── ds_pretrain_gpt_6.7B_dense.sh │ └── readme_evalharness.md ├── README.md ├── azure │ ├── README.md │ ├── run-175b.sh │ ├── run-1t.sh │ └── run-benchmark-model.sh ├── azureml │ ├── Dockerfile.dockerfile │ ├── README.md │ ├── aml_submit.py │ └── prepare_dataset.py ├── bert_with_pile │ ├── README.md │ ├── ds_config_bert_TEMPLATE.json │ ├── ds_finetune_bert_mnli.sh │ ├── ds_finetune_bert_qqp.sh │ ├── ds_finetune_bert_race.sh │ ├── ds_pretrain_bert.sh │ └── prepare_pile_data.py ├── compression │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_TEMPLATE_compression.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ └── ds_pretrain_gpt_350M_dense_kd.sh ├── create_embeddings.sh ├── curriculum_learning │ ├── README.md │ ├── ds_pretrain_gpt2.sh │ ├── ds_train.sh │ ├── ds_zero_stage_1_config_baseline.json │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json ├── data_efficiency │ ├── README.md │ ├── analyze_data.py │ ├── bert │ │ ├── ds_analyze_bert_data_map.sh │ │ ├── ds_analyze_bert_data_reduce.sh │ │ ├── finetune │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_mnli.sh │ │ │ ├── ds_finetune_bert_qqp.sh │ │ │ ├── ds_finetune_bert_race.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── finetune_glue │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_glue.sh │ │ │ ├── ds_finetune_bert_glue_run.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── pile_data_download_preprocess.py │ │ └── pretrain │ │ │ ├── ds_config_bert_1clmetric_TEMPLATE.json │ │ │ ├── ds_config_bert_2clmetrics_TEMPLATE.json │ │ │ ├── ds_pretrain_bert_336M_base_script.sh │ │ │ └── ds_pretrain_bert_336M_run.sh │ └── gpt │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── eval │ │ ├── ds_config_eval_dummy.json │ │ ├── ds_evalharness_1gpu.sh │ │ ├── ds_evalharness_gather_result.py │ │ ├── ds_evalharness_parallel_run.sh │ │ └── ds_evalharness_parallel_run_10shot.sh │ │ └── pretrain │ │ ├── ds_config_gpt_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt_2clmetrics_TEMPLATE.json │ │ ├── ds_pretrain_gpt_1.3B_dense_base_script.sh │ │ └── ds_pretrain_gpt_1.3B_dense_run.sh ├── evaluate_ict_zeroshot_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── generate_text.sh ├── merge_mp_bert.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── 
pretrain_llama_distributed.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh └── run_deepspeed_example.sh ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rotary_pos_embedding.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── package_info.py ├── schedules.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ ├── sp_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_t5.py ├── pretrain_vit.py ├── requirements.txt ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── cola.py │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ ├── mrpc.py │ ├── qnli.py │ ├── qqp.py │ ├── rte.py │ ├── sst2.py │ └── stsb.py ├── main.py ├── orqa │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ └── natural_questions │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── run_megatron.py ├── test_basic.py └── test_megatron.py └── tools ├── convert_checkpoint ├── README.md ├── deepspeed_checkpoint.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── create_doc_index.py ├── generate_samples_gpt.py ├── linter.py ├── merge_mp_partitions.py ├── openwebtext ├── README.md ├── add_id.py ├── 
blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py └── preprocess_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | # Distribution / packaging 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | 8 | # binaries 9 | *.so 10 | 11 | # tmp files 12 | *.swp 13 | 14 | # AML workspace config file 15 | *config.json -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc 4 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to set up the dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | -------------------------------------------------------------------------------- /dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../..
8 | 9 | -------------------------------------------------------------------------------- /dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /examples/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /examples/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples/MoE/ds_evalharness.sh: -------------------------------------------------------------------------------- 1 | # This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory. 
2 | 3 | CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B/global_step81566/ 4 | CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B.json 5 | RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log 6 | 7 | PP_SIZE=1 8 | TP_SIZE=1 9 | NO_PP="true" 10 | EP_PARALLEL_SIZE=1 11 | # Currently the eval harness does not support data parallelism. 12 | # However, for MoE models it's possible to enable a "fake data parallelism" 13 | # in order to load experts on multiple GPUs. At the same time, it's not 14 | # real data parallelism because we load the same data on all GPUs. 15 | # On the other hand, it's better to use fewer GPUs than in training, 16 | # to reduce communication overhead. 17 | NUM_NODE=1 18 | NUM_GPU_PER_NODE=1 19 | 20 | TASKS="lambada" 21 | # WikiText-2, not used in the GPT-3 paper but used in the GPT-2 paper 22 | # TASKS="wikitext" 23 | # Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2. 24 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext" 25 | # All tasks confirmed to work; there are more tasks at https://github.com/EleutherAI/lm-evaluation-harness that we didn't test. 26 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli" 27 | 28 | VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json 29 | MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt 30 | 31 | export HF_DATASETS_OFFLINE=1 32 | 33 | # Dummy arguments to make Megatron happy. No need to configure them. 34 | # The reason we don't need to configure them and many other arguments is 35 | # that the eval framework will read the arguments from the checkpoint file.
36 | MEGATRON_REQUIRED_ARGS="\ 37 | --num-layers -1\ 38 | --hidden-size -1\ 39 | --num-attention-heads -1\ 40 | --seq-length -1 \ 41 | --max-position-embeddings -1 42 | " 43 | 44 | CMD="../../tasks/eval_harness/evaluate.py \ 45 | --load $CHECKPOINT_PATH\ 46 | --tensor-model-parallel-size $TP_SIZE \ 47 | --pipeline-model-parallel-size $PP_SIZE\ 48 | --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \ 49 | --vocab-file $VOCAB_FILE\ 50 | --merge-file $MERGE_FILE\ 51 | --micro-batch-size 12\ 52 | --no-load-optim \ 53 | --no-load-rng \ 54 | --inference \ 55 | --disable-moe-token-dropping \ 56 | --adaptive_seq_len\ 57 | --eval_fp32\ 58 | --task_list $TASKS\ 59 | --results_path $RESULT_PATH \ 60 | --deepspeed \ 61 | --deepspeed_config $CONFIG_PATH \ 62 | $MEGATRON_REQUIRED_ARGS\ 63 | " 64 | 65 | if [[ "${NO_PP}" = "true" ]]; then 66 | CMD="${CMD} \ 67 | --no-pipeline-parallel" 68 | fi 69 | 70 | LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE" 71 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes and Scripts 2 | 3 | Please note that some of the script examples (e.g., pretrain_*.sh directly under the ```Megatron-DeepSpeed/examples/``` folder) come from NVIDIA's original Megatron-LM and do not have DeepSpeed integration (scripts with DeepSpeed integration include the ```deepspeed``` keyword). Below we list various examples that do have DeepSpeed integration. 4 | 5 | ### Azure 6 | 7 | We strongly recommend starting with the AzureML recipe in the ```azureml``` folder. 8 | 9 | If you have custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder. 10 | 11 | ### MoE 12 | 13 | Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-Experts (MoE) based models and dense models. These recipes are for GPT-style NLG models. 14 | 15 | ### Data Efficiency 16 | 17 | The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for the DeepSpeed Data Efficiency Library. Please refer to the detailed tutorials in data_efficiency/README.md. 18 | 19 | ### Curriculum Learning 20 | 21 | Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models. 22 | Note that the DeepSpeed Data Efficiency Library above includes more general curriculum learning support. The legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library instead. 23 | 24 | ### Model Compression 25 | 26 | The ```compression``` folder includes examples of layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) on the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models. 27 | 28 | ### BERT example 29 | 30 | The ```bert_with_pile``` folder includes examples of BERT-style model pre-training (using the public Pile data or users' own data) with DeepSpeed integration. Please refer to the readme in the folder for a tutorial.
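Most recipes above share a common pattern: a ```ds_config_*_TEMPLATE.json``` file whose ```CONFIG_*```-style placeholders are filled in by the launcher script before the resulting config is passed to ```deepspeed```. Below is a minimal sketch of that substitution step; the helper function and the example values are ours for illustration (the actual recipe scripts do the equivalent with shell string substitution).

```python
# Minimal sketch (not part of any recipe): instantiate a
# ds_config_*_TEMPLATE.json by plain textual substitution.
from pathlib import Path

def render_template(template_path: str, output_path: str, values: dict) -> None:
    """Replace each placeholder token with its value, as raw text."""
    text = Path(template_path).read_text()
    for placeholder, value in values.items():
        text = text.replace(placeholder, str(value))
    Path(output_path).write_text(text)

# Illustrative values only; the real recipes derive these from the model setup.
render_template(
    "examples/MoE/ds_config_gpt_TEMPLATE.json",
    "ds_config_gpt.json",
    {
        "CONFIG_BATCH_SIZE": 256,
        "CONFIG_MBSIZE": 4,
        "LOG_INTERVAL": 10,
        "ZERO_STAGE": 1,
        "PRESCALE_GRAD": "true",
        "CONFIG_FP16_ENABLED": "true",
        "CONFIG_BF16_ENABLED": "false",
        "CONFIG_CL_ENABLED": "false",
        "CONFIG_CL_MIN": 80,
        "CONFIG_CL_MAX": 2048,
        "CONFIG_CL_DURATION": 10000,
    },
)
```

Note that placeholders are spliced in as raw text rather than as JSON values, which is why booleans are passed as the strings ```true```/```false```.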
31 | -------------------------------------------------------------------------------- /examples/azure/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for experimentation on Azure 2 | 3 | The recipes have been tested from the command line on a cluster set up using Azure VMs and VMSS, as well as inside Docker-based environments. 4 | 5 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows: 6 | 7 | ```bash examples/azure/run-benchmark-model.sh``` 8 | 9 | ### Pre-requisites 10 | 11 | To run the above script, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder: 12 | 13 | ```bash dataset/download_books.sh``` 14 | 15 | ```bash dataset/download_vocab.sh``` 16 | 17 | ### Run 175B and 1T models 18 | 19 | We have included two recipes, one for the 175B model and one for the 1T model. To train these models, we assume that users will modify and tune hyperparameters and configurations themselves. To facilitate initial training, we have made the recipes runnable with the Books dataset as follows. 20 | 21 | ```bash examples/azure/run-175b.sh``` 22 | 23 | ```bash examples/azure/run-1t.sh``` 24 | 25 | ### Note about ZeRO stage 3 and CPU offload 26 | 27 | By default, we have enabled ZeRO Stage 3 for both of the recipes above. For the 1T model, we have also enabled the CPU-offload feature to save memory and enable a larger batch size that offers better performance. 28 | -------------------------------------------------------------------------------- /examples/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.5-gpu 2 | USER root:root 3 | 4 | RUN pip install pybind11 5 | RUN pip install regex -------------------------------------------------------------------------------- /examples/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Set up an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create an AML Dataset. To run a remote AML job, you need to provide an AML FileDataset. 11 | Refer to the [prepare_dataset script](prepare_dataset.py) for uploading the .bin and .idx files to the blob store and for creating the FileDataset. 12 | 13 | # Training 14 | Run Megatron-DeepSpeed on Azure ML. Refer to the [aml_submit script](aml_submit.py).
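For orientation, here is a rough sketch of what such a submission can look like with the v1 ```azureml-core``` SDK. It is not a substitute for the aml_submit script: the compute target name, experiment name, entry point, and arguments below are placeholders.

```python
# Rough sketch of an AzureML v1 SDK submission, in the spirit of
# aml_submit.py (see that script for the real, tested version).
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.core.runconfig import PyTorchConfiguration

ws = Workspace.from_config()  # reads the workspace config.json

# Build the job environment from the Dockerfile in this folder.
env = Environment.from_dockerfile(
    name="megatron-ds-env",
    dockerfile=open("examples/azureml/Dockerfile.dockerfile").read(),
)

run_config = ScriptRunConfig(
    source_directory=".",                   # repo root
    script="pretrain_gpt.py",               # entry point (placeholder)
    arguments=["--micro-batch-size", "4"],  # illustrative only
    compute_target="megatron-compute",      # your AML compute cluster (placeholder)
    environment=env,
    distributed_job_config=PyTorchConfiguration(node_count=2),
)

run = Experiment(ws, name="megatron-deepspeed").submit(run_config)
run.wait_for_completion(show_output=True)
```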
15 | -------------------------------------------------------------------------------- /examples/azureml/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Use this script to upload data to the blob store 2 | 3 | # AzureML libraries 4 | from azureml.core import Workspace 5 | from azureml.core.dataset import Dataset 6 | from azureml.data.datapath import DataPath 7 | 8 | ws = Workspace.from_config() 9 | print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') 10 | 11 | data_dir = "bookcorpus_data" # Local directory containing the data (.bin and .idx files) 12 | blobstore_datadir = data_dir # Blob store directory to store data in 13 | 14 | datastore = ws.get_default_datastore() 15 | 16 | # Book Corpus Data 17 | print("upload dataset to blob store") 18 | uploaded_data = Dataset.File.upload_directory( 19 | src_dir=data_dir, 20 | target=DataPath(datastore, blobstore_datadir), 21 | show_progress=True 22 | ) 23 | 24 | # Usage after uploading the directory 25 | # To refer to the folder directly: 26 | train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)]) 27 | print(train_dataset) 28 | # To refer to a specific file: 29 | # train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir + "/filename.ext")]) 30 | # Create a DatasetConsumptionConfig to specify how to deliver the dataset to a compute target. 31 | # In the submitted run, files in the datasets will be either mounted or downloaded to a local path on the compute target. 32 | # input_data_dir = train_dataset.as_mount() 33 | # input_data_dir = train_dataset.as_download() 34 | -------------------------------------------------------------------------------- /examples/bert_with_pile/README.md: -------------------------------------------------------------------------------- 1 | This ```bert_with_pile``` folder includes examples of BERT pre-training (using [the public Pile data](https://github.com/EleutherAI/the-pile) or users' own data) with DeepSpeed integration. We also provide scripts for preprocessing the Pile data and for MNLI finetuning. 2 | 3 | ## Data preprocessing 4 | ```prepare_pile_data.py``` is the script for downloading, decompressing, and preprocessing [the public Pile data](https://github.com/EleutherAI/the-pile). Users can also modify this script to preprocess their own training data. 5 | 6 | ## BERT pre-training 7 | ```ds_pretrain_bert.sh``` is the script for BERT pre-training integrated with DeepSpeed, supporting [ZeRO](https://www.deepspeed.ai/tutorials/zero/) together with Megatron's tensor-slicing model parallelism. The training hyperparameters follow the [Megatron paper](https://arxiv.org/abs/1909.08053). Note that pipeline parallelism is currently not supported: DeepSpeed's pipeline parallelism is only integrated with the GPT case, and DeepSpeed is currently not integrated with Megatron's own pipeline parallelism. 8 | 9 | As a reference performance number, our measurements show that this example achieves a throughput of up to 145 TFLOPs per GPU when pre-training a 1.3B BERT model (with ZeRO stage-1, without model parallelism, on 64 NVIDIA A100 GPUs, with batch size 4096 (64 per GPU), and with activation checkpointing).
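As a back-of-envelope check of such throughput numbers, one can use the common approximation of ~6*N training FLOPs per token for an N-parameter dense transformer, or ~8*N when activation checkpointing recomputes the forward pass. The sketch below is our illustration, not part of the recipe.

```python
# Back-of-envelope throughput check (ours, not part of the recipe),
# assuming ~6*N training FLOPs per token for an N-parameter dense
# transformer, and ~8*N with activation checkpointing (extra forward pass).
def tflops_per_gpu(n_params: float, samples_per_sec: float, seq_len: int,
                   num_gpus: int, activation_checkpointing: bool = True) -> float:
    flops_per_token = (8 if activation_checkpointing else 6) * n_params
    tokens_per_sec = samples_per_sec * seq_len
    return flops_per_token * tokens_per_sec / num_gpus / 1e12

# A 1.3B BERT at sequence length 512 on 64 GPUs needs roughly 1730
# samples/sec in total (~27 per GPU) to sustain ~145 TFLOPs per GPU:
print(tflops_per_gpu(1.3e9, 1730, 512, 64))  # ~143.9
```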
10 | 11 | One thing to note is that this pre-training recipe is NOT a strict reproduction of the [original BERT paper](https://arxiv.org/abs/1810.04805): the Pile data is larger than the data used for the original BERT (and the data used in the Megatron paper); Megatron-LM introduces some changes to the BERT model (see details in the [Megatron paper](https://arxiv.org/abs/1909.08053)); and the training hyperparameters are also different. Overall, these differences lead to longer training time but also better model quality than the original BERT (see the MNLI scores below), while supporting larger model scale through the combination of ZeRO and model parallelism. If you don't have enough computation budget, we recommend reducing the total training iterations (```train_iters``` in the script) and potentially increasing the learning rate at the same time. If you want to strictly reproduce the original BERT, we recommend using our [other BERT example](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert). 12 | 13 | ## BERT MNLI fine-tuning 14 | ```ds_finetune_bert_mnli.sh``` is the script for BERT MNLI fine-tuning, following the hyperparameters in the [Megatron paper](https://arxiv.org/abs/1909.08053). As a reference, the table below presents the scores using the model pre-trained with the script above, compared with the scores of the original BERT and the Megatron paper's BERT. Our BERT-Large score is slightly lower than the Megatron paper's, mainly due to the different data we used (the Pile data is much more diverse and larger than the data in the Megatron paper, which potentially has a negative effect on small, million-scale models). 15 | 16 | | MNLI dev set accuracy | **MNLI-m** | **MNLI-mm** | 17 | | ---------- |---------- |---------- | 18 | | BERT-Base, [original BERT](https://arxiv.org/abs/1810.04805) | 84.6 | 83.4 | 19 | | BERT-Base, ours (median of 5 seeds) | 86.1 | 86.1 | 20 | | BERT-Large, [original BERT](https://arxiv.org/abs/1810.04805) | 86.7 | 85.9 | 21 | | BERT-Large, [Megatron paper](https://arxiv.org/abs/1909.08053) | 89.7 | 90.0 | 22 | | BERT-Large, ours (median of 5 seeds) | 89.1 | 89.6 | 23 | 24 | -------------------------------------------------------------------------------- /examples/bert_with_pile/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | 27 | "wall_clock_breakdown" : false 28 | } 29 | -------------------------------------------------------------------------------- /examples/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0,
17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /examples/compression/ds_config_gpt_TEMPLATE_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false, 39 | 40 | "compression_training": { 41 | "weight_quantization": { 42 | "shared_parameters":{ 43 | "enabled": true, 44 | "quantizer_kernel": false, 45 | "schedule_offset": 50, 46 | "quantize_groups": 48, 47 | "quantize_verbose": false, 48 | "quantization_type": "symmetric", 49 | "rounding": "nearest", 50 | "fp16_mixed_quantize":{ 51 | "enabled": false, 52 | "quantize_change_ratio": 0.001 53 | } 54 | }, 55 | "different_groups":{ 56 | "wq1": { 57 | "params": { 58 | "start_bits": 12, 59 | "target_bits": 4, 60 | "quantization_period": 50 61 | }, 62 | "modules": [ 63 | "encoder.layers" 64 | ] 65 | } 66 | } 67 | }, 68 | "activation_quantization": { 69 | "shared_parameters":{ 70 | "enabled": true, 71 | "quantization_type": "asymmetric", 72 | "range_calibration": "static", 73 | "schedule_offset": 50 74 | }, 75 | "different_groups":{ 76 | "aq1": { 77 | "params": { 78 | "bits": 8 79 | }, 80 | "modules": [ 81 | "encoder.layers" 82 | ] 83 | } 84 | } 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/compression/ds_evalharness.sh: -------------------------------------------------------------------------------- 1 | # This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory. 
2 | 3 | # CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step2000/ 4 | # CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71000/ 5 | # CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M12L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step5000/ 6 | CHECKPOINT_PATH=/blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-test2-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-15-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71426/ 7 | CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus--1-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B.json 8 | RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log 9 | 10 | PP_SIZE=1 11 | TP_SIZE=1 12 | NO_PP="true" 13 | EP_PARALLEL_SIZE=1 14 | # Currently the eval harness does not support data parallelism. 15 | # However, for MoE models it's possible to enable a "fake data parallelism" 16 | # in order to load experts on multiple GPUs. At the same time, it's not 17 | # real data parallelism because we load the same data on all GPUs. 18 | # On the other hand, it's better to use fewer GPUs than in training, 19 | # to reduce communication overhead. 20 | NUM_NODE=1 21 | NUM_GPU_PER_NODE=1 22 | 23 | # TASKS="lambada" 24 | # WikiText-2, not used in the GPT-3 paper but used in the GPT-2 paper 25 | TASKS="lambada,wikitext" 26 | # Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2. 27 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext" 28 | # All tasks confirmed to work; there are more tasks at https://github.com/EleutherAI/lm-evaluation-harness that we didn't test. 29 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli" 30 | 31 | VOCAB_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json 32 | MERGE_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt 33 | 34 | export HF_DATASETS_OFFLINE=1 35 | 36 | # Dummy arguments to make Megatron happy. No need to configure them. 37 | # The reason we don't need to configure them and many other arguments is 38 | # that the eval framework will read the arguments from the checkpoint file.
39 | MEGATRON_REQUIRED_ARGS="\ 40 | --num-layers -1\ 41 | --hidden-size -1\ 42 | --num-attention-heads -1\ 43 | --seq-length -1 \ 44 | --max-position-embeddings -1 45 | " 46 | 47 | CMD="../../tasks/eval_harness/evaluate.py \ 48 | --load $CHECKPOINT_PATH\ 49 | --tensor-model-parallel-size $TP_SIZE \ 50 | --pipeline-model-parallel-size $PP_SIZE\ 51 | --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \ 52 | --vocab-file $VOCAB_FILE\ 53 | --merge-file $MERGE_FILE\ 54 | --micro-batch-size 12\ 55 | --no-load-optim \ 56 | --no-load-rng \ 57 | --inference \ 58 | --disable-moe-token-dropping \ 59 | --adaptive_seq_len\ 60 | --eval_fp32\ 61 | --task_list $TASKS\ 62 | --results_path $RESULT_PATH \ 63 | --deepspeed \ 64 | --deepspeed_config $CONFIG_PATH \ 65 | $MEGATRON_REQUIRED_ARGS\ 66 | " 67 | 68 | if [[ "${NO_PP}" = "true" ]]; then 69 | CMD="${CMD} \ 70 | --no-pipeline-parallel" 71 | fi 72 | 73 | LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE" 74 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /examples/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). 
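For intuition, the ```fixed_linear``` seqlen schedule used by the configs in this folder ramps the difficulty (sequence length) linearly from ```min_difficulty``` to ```max_difficulty``` over ```total_curriculum_step``` steps, rounded to a multiple of ```difficulty_step```. Below is a small sketch of our reading of that rule, with values taken from ds_train.sh; it is not code from the DeepSpeed library itself.

```python
# Sketch of the "fixed_linear" seqlen schedule (our reading of the config
# semantics, not DeepSpeed source). Defaults mirror ds_train.sh:
# 64 -> 1024 over 45000 steps, rounded to a multiple of 8.
def fixed_linear_seqlen(step: int, min_diff: int = 64, max_diff: int = 1024,
                        total_curriculum_step: int = 45000,
                        difficulty_step: int = 8) -> int:
    frac = min(1.0, step / total_curriculum_step)
    seqlen = min_diff + frac * (max_diff - min_diff)
    seqlen = int(seqlen) // difficulty_step * difficulty_step
    return max(min_diff, min(seqlen, max_diff))

for step in (0, 10000, 45000, 60000):
    print(step, fixed_linear_seqlen(step))  # 64, 272, 1024, 1024
```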
-------------------------------------------------------------------------------- /examples/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/data_efficiency/README.md: -------------------------------------------------------------------------------- 1 | This directory includes GPT-3/BERT pretraining example scripts for DeepSpeed Data Efficiency Library technologies (curriculum learning, random-LTD, and the two composed together). 
2 | 3 | You need to install an updated DeepSpeed version (>=0.8.0), which contains the DeepSpeed Data Efficiency Library. 4 | 5 | An additional tutorial can be found on the [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/). 6 | 7 | Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597). 8 | 9 | ## GPT-3 pretraining and evaluation 10 | Inside the ``gpt`` folder, ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` are first used for curriculum learning's offline data analysis and indexing. 11 | 12 | ``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of the data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality. 13 | 14 | ``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot. 15 | 16 | ## BERT pretraining and finetuning 17 | Inside the ``bert`` folder, ``pile_data_download_preprocess.py`` can first be used to download and preprocess the public Pile dataset. 18 | 19 | The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. 20 | 21 | ``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of the data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality. 22 | 23 | ``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable, and the Megatron-LM paper used a very large number of epochs for MNLI/QQP, which is not necessary. Thus we added the capability of finetuning other GLUE tasks, and switched to following the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend using instead of ``bert/finetune``. Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning. -------------------------------------------------------------------------------- /examples/data_efficiency/bert/ds_analyze_bert_data_map.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_workers=1 # Num nodes to run the map job 4 | num_threads=40 # Num threads on each node. Set this based on #CPU cores 5 | 6 | # If different data epochs have slightly different data samples (e.g., due 7 | # to randomness), then you need to specify large enough num_epochs that cover 8 | # whole pretraining.
If different data epochs are the same, set num_epochs to 9 | # 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency 10 | # library will automatically handle reshuffling when reaching another epoch. 11 | num_epochs=5 12 | 13 | # Which node is this node (start with 0 and end with num_workers-1). This 14 | # script only launches the map job on 1 worker node, since we don't expect 15 | # running on many nodes and workers don't need any communication. But you 16 | # can modify this script to add an MPI/torch distributed launcher. 17 | worker_id=$1 18 | save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" 19 | 20 | metric='total_vocab_freq' 21 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 22 | # metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq 23 | # metric='seqlen' 24 | 25 | seq_len=512 26 | batch_size=10000 27 | 28 | jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" 29 | ## The public Pile dataset; see prepare_pile_data.py in the same directory 30 | ## for how to download and preprocess the data. 31 | ## Change data_home to your own training data path. 32 | # data_home="/vc_data_blob/users/conglli/the_pile_bert" 33 | data_home="/blob/data/the_pile_bert" 34 | data_path="${data_home}/pile_bert_train_text_sentence" 35 | 36 | vocab_path="bert-large-uncased-vocab.txt" 37 | if [ ! -f "$vocab_path" ]; then 38 | wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt 39 | fi 40 | 41 | # Make sure the "--split" is the same as what you will use for pre-training. 42 | options=" \ 43 | --analyzing-task map \ 44 | --analyzing-data-type BERT \ 45 | --analyzing-metric ${metric} \ 46 | --analyzing-num-workers ${num_workers} \ 47 | --analyzing-worker-id ${worker_id} \ 48 | --analyzing-num-threads ${num_threads} \ 49 | --vocab-file ${vocab_path} \ 50 | --data-path ${data_path} \ 51 | --data-impl mmap \ 52 | --tokenizer-type BertWordPieceLowerCase \ 53 | --micro-batch-size ${batch_size} \ 54 | --global-batch-size ${batch_size} \ 55 | --seq-length ${seq_len} \ 56 | --max-position-embeddings ${seq_len} \ 57 | --num-layers 1 \ 58 | --hidden-size 1 \ 59 | --num-attention-heads 1 \ 60 | --split 949,50,1 \ 61 | --distributed-backend gloo \ 62 | --train-data-exact-num-epochs ${num_epochs} \ 63 | --return-data-index \ 64 | --save-interval 1 \ 65 | --save ${save_path}" 66 | 67 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same as what you used during the map job. We need these 2 4 | # configs to know how many map job result files we have. 5 | num_workers=1 6 | num_threads=40 7 | # Reduce job only has 1 worker but can accelerate by multithreading. 8 | num_threads_reduce=40 9 | 10 | # If different data epochs have slightly different data samples (e.g., due 11 | # to randomness), then you need to specify large enough num_epochs that cover 12 | # whole pretraining. If different data epochs are the same, set num_epochs to 13 | # 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency 14 | # library will automatically handle reshuffling when reaching another epoch.
15 | num_epochs=5 16 | 17 | save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" 18 | 19 | metric='total_vocab_freq' 20 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 21 | # metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq 22 | # metric='seqlen' 23 | 24 | seq_len=512 25 | batch_size=10000 26 | 27 | jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce" 28 | ## The public Pile dataset; see prepare_pile_data.py in the same directory 29 | ## for how to download and preprocess the data. 30 | ## Change data_home to your own training data path. 31 | # data_home="/vc_data_blob/users/conglli/the_pile_bert" 32 | data_home="/blob/data/the_pile_bert" 33 | data_path="${data_home}/pile_bert_train_text_sentence" 34 | 35 | vocab_path="bert-large-uncased-vocab.txt" 36 | if [ ! -f "$vocab_path" ]; then 37 | wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt 38 | fi 39 | 40 | # Make sure the "--split" is the same as what you will use for pre-training. 41 | options=" \ 42 | --analyzing-task reduce \ 43 | --analyzing-data-type BERT \ 44 | --analyzing-metric ${metric} \ 45 | --analyzing-num-workers ${num_workers} \ 46 | --analyzing-num-threads ${num_threads} \ 47 | --analyzing-num-threads-reduce ${num_threads_reduce} \ 48 | --vocab-file ${vocab_path} \ 49 | --data-path ${data_path} \ 50 | --data-impl mmap \ 51 | --tokenizer-type BertWordPieceLowerCase \ 52 | --micro-batch-size ${batch_size} \ 53 | --global-batch-size ${batch_size} \ 54 | --seq-length ${seq_len} \ 55 | --max-position-embeddings ${seq_len} \ 56 | --num-layers 1 \ 57 | --hidden-size 1 \ 58 | --num-attention-heads 1 \ 59 | --split 949,50,1 \ 60 | --distributed-backend gloo \ 61 | --train-data-exact-num-epochs ${num_epochs} \ 62 | --return-data-index \ 63 | --save-interval 1 \ 64 | --save ${save_path}" 65 | 66 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 | -------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 |
-------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh: -------------------------------------------------------------------------------- 1 | hostname_and_rank=$1 2 | master_port=$2 3 | pretrained_checkpoint=$3 4 | 5 | # hostname_and_rank="worker-0:0,1,2,3" 6 | # master_port=12345 7 | # pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" 8 | 9 | tasks=( 10 | RTE 11 | MRPC 12 | STS-B 13 | CoLA 14 | SST-2 15 | QNLI 16 | QQP 17 | MNLI 18 | ) 19 | 20 | seeds=( 21 | 1234 22 | 1235 23 | 1236 24 | 1237 25 | 1238 26 | ) 27 | 28 | lrs=( 29 | 2e-5 30 | 3e-5 31 | 4e-5 32 | 5e-5 33 | ) 34 | 35 | for ((i=0;i<${#tasks[@]};++i)); do 36 | task=${tasks[i]} 37 | for ((j=0;j<${#seeds[@]};++j)); do 38 | seed=${seeds[j]} 39 | for ((k=0;k<${#lrs[@]};++k)); do 40 | lr=${lrs[k]} 41 | bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} 42 | done 43 | done 44 | done -------------------------------------------------------------------------------- /examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "encoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "encoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | }, 70 | "CL_2nd_METRIC_NAME": { 71 | "index_to_sample_path": "CL_2nd_SAMPLE_PATH", 72 | "index_to_metric_path": "CL_2nd_METRIC_PATH", 73 | "difficulty_type": "CL_2nd_DIFF_TYPE", 74 | "clustering_type": "CL_2nd_CLUSTER_TYPE", 75 | "min_difficulty": CL_2nd_MIN, 76 | "max_difficulty": CL_2nd_MAX, 77 | "schedule_type": "fixed_root", 78 | "schedule_config": { 79 | "total_curriculum_step": CL_2nd_TOTAL_STEP, 80 | "difficulty_step": CL_2nd_DIFF_STEP, 81 | "root_degree": CL_2nd_ROOT 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_workers=1 # Num nodes to run the map job 4 | num_threads=40 # Num threads on each node. Set this based on the number of CPU cores. 5 | 6 | # If different data epochs have slightly different data samples (e.g., due 7 | # to randomness), then you need to specify a num_epochs large enough to cover 8 | # the whole pretraining. If different data epochs are the same, set num_epochs to 9 | # 1 to only index 1 epoch, and during pretraining the DeepSpeed data efficiency 10 | # library will automatically handle reshuffling when reaching another epoch. 11 | num_epochs=1 12 | 13 | # Which node this is (starts at 0 and ends at num_workers-1). This 14 | # script only launches the map job on 1 worker node, since we don't expect 15 | # to run on many nodes and the workers don't need any communication. But you 16 | # can modify this script to add an MPI/torch distributed launcher.
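# For example (an illustrative single-node usage), worker 0 is launched as:
#   bash ds_analyze_gpt_data_map.sh 0
# and with num_workers=2 the second node would run: bash ds_analyze_gpt_data_map.sh 1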
17 | worker_id=$1 18 | save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" 19 | 20 | metric='total_vocab_freq' 21 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 22 | 23 | seq_len=2048 24 | batch_size=10000 25 | 26 | jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" 27 | # The Pile is a public dataset which can be downloaded at 28 | # https://mystic.the-eye.eu/public/AI/pile_neox/ 29 | ## Change data_home to your own training data path. 30 | # data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" 31 | data_home="/blob/data/the_pile_public_merged_nopreprocessing" 32 | data_path="${data_home}/pile_text_document" 33 | 34 | vocab_path="gpt2-vocab.json" 35 | if [ ! -f "$vocab_path" ]; then 36 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 37 | fi 38 | merge_path="gpt2-merges.txt" 39 | if [ ! -f "$merge_path" ]; then 40 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 41 | fi 42 | 43 | # Make sure the "--split" is the same as what you will use for pre-training. 44 | options=" \ 45 | --analyzing-task map \ 46 | --analyzing-data-type GPT \ 47 | --analyzing-metric ${metric} \ 48 | --analyzing-num-workers ${num_workers} \ 49 | --analyzing-worker-id ${worker_id} \ 50 | --analyzing-num-threads ${num_threads} \ 51 | --vocab-file ${vocab_path} \ 52 | --merge-file ${merge_path} \ 53 | --data-path ${data_path} \ 54 | --data-impl mmap \ 55 | --tokenizer-type GPT2BPETokenizer \ 56 | --micro-batch-size ${batch_size} \ 57 | --global-batch-size ${batch_size} \ 58 | --seq-length ${seq_len} \ 59 | --max-position-embeddings ${seq_len} \ 60 | --num-layers 1 \ 61 | --hidden-size 1 \ 62 | --num-attention-heads 1 \ 63 | --split 949,50,1 \ 64 | --distributed-backend gloo \ 65 | --train-data-exact-num-epochs ${num_epochs} \ 66 | --return-data-index \ 67 | --save-interval 1 \ 68 | --save ${save_path}" 69 | 70 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same values you used during the map job. We need these 2 4 | # configs to know how many map job result files there are. 5 | num_workers=1 6 | num_threads=40 7 | # The reduce job only has 1 worker but can be accelerated by multithreading. 8 | num_threads_reduce=40 9 | 10 | # If different data epochs have slightly different data samples (e.g., due 11 | # to randomness), then you need to specify a num_epochs large enough to cover 12 | # the whole pretraining. If different data epochs are the same, set num_epochs to 13 | # 1 to only index 1 epoch, and during pretraining the DeepSpeed data efficiency 14 | # library will automatically handle reshuffling when reaching another epoch.
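# For example (assuming every map worker has finished and written its results
# under save_path), the reduce job is a single local invocation:
#   bash ds_analyze_gpt_data_reduce.sh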
15 | num_epochs=1 16 | 17 | save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" 18 | 19 | metric='total_vocab_freq' 20 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 21 | 22 | seq_len=2048 23 | batch_size=10000 24 | 25 | jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-reduce" 26 | # The Pile is a public dataset which can be downloaded at 27 | # https://mystic.the-eye.eu/public/AI/pile_neox/ 28 | ## Change data_home to your own training data path. 29 | # data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" 30 | data_home="/blob/data/the_pile_public_merged_nopreprocessing" 31 | data_path="${data_home}/pile_text_document" 32 | 33 | vocab_path="gpt2-vocab.json" 34 | if [ ! -f "$vocab_path" ]; then 35 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 36 | fi 37 | merge_path="gpt2-merges.txt" 38 | if [ ! -f "$merge_path" ]; then 39 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 40 | fi 41 | 42 | # Make sure the "--split" is the same as what you will use for pre-training. 43 | options=" \ 44 | --analyzing-task reduce \ 45 | --analyzing-data-type GPT \ 46 | --analyzing-metric ${metric} \ 47 | --analyzing-num-workers ${num_workers} \ 48 | --analyzing-num-threads ${num_threads} \ 49 | --analyzing-num-threads-reduce ${num_threads_reduce} \ 50 | --vocab-file ${vocab_path} \ 51 | --merge-file ${merge_path} \ 52 | --data-path ${data_path} \ 53 | --data-impl mmap \ 54 | --tokenizer-type GPT2BPETokenizer \ 55 | --micro-batch-size ${batch_size} \ 56 | --global-batch-size ${batch_size} \ 57 | --seq-length ${seq_len} \ 58 | --max-position-embeddings ${seq_len} \ 59 | --num-layers 1 \ 60 | --hidden-size 1 \ 61 | --num-attention-heads 1 \ 62 | --split 949,50,1 \ 63 | --distributed-backend gloo \ 64 | --train-data-exact-num-epochs ${num_epochs} \ 65 | --return-data-index \ 66 | --save-interval 1 \ 67 | --save ${save_path}" 68 | 69 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 2048, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 10, 5 | 6 | "zero_optimization": { 7 | "stage": 0, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": true, 13 | 14 | "fp16": { 15 | "enabled": false, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": false 25 | }, 26 | 27 | "wall_clock_breakdown" : false 28 | } -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | 4 | ## The code below only works when you run each evalharness task on a single GPU. 5 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh
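## Example invocation (hypothetical paths; the positional args, in order, are
## checkpoint, config, result dir, GPU rank, task list, host, batch size, fewshot):
##   bash ds_evalharness_1gpu.sh /blob/ckpt ds_config_eval_dummy.json /blob/results 0 lambada worker-0 32 0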
6 | checkpoint_path=$1 7 | config_path=$2 8 | result_path=$3 9 | rank=$4 10 | tasks=$5 11 | hostname=$6 12 | master_port=$(( 12345 + ${rank} )) 13 | batch_size=$7 14 | num_fewshot=$8 15 | 16 | mp_size=1 17 | pp_size=1 18 | no_pp="true" 19 | ep_size=1 20 | 21 | vocab_file="gpt2-vocab.json" 22 | if [ ! -f "$vocab_file" ]; then 23 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 24 | fi 25 | merge_file="gpt2-merges.txt" 26 | if [ ! -f "$merge_file" ]; then 27 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 28 | fi 29 | 30 | export HF_DATASETS_OFFLINE=1 31 | 32 | dir2=$(dirname "$checkpoint_path") 33 | dirname=$(basename "$dir2")/$(basename "$checkpoint_path") 34 | result_path="${result_path}/${dirname}" 35 | mkdir -p $result_path 36 | result_file="${result_path}/${tasks}_${num_fewshot}shot.json" 37 | 38 | # Dummy arguments to make megatron happy. No need to configure them. 39 | # The reason we don't need to configure them and many other arguments is 40 | # that the eval framework will read the arguments from the checkpoint file. 41 | megatron_required_args="\ 42 | --num-layers -1 \ 43 | --hidden-size -1 \ 44 | --num-attention-heads -1 \ 45 | --seq-length -1 \ 46 | --max-position-embeddings -1 47 | " 48 | 49 | command="../../../../tasks/eval_harness/evaluate.py \ 50 | --load ${checkpoint_path} \ 51 | --tensor-model-parallel-size ${mp_size} \ 52 | --pipeline-model-parallel-size ${pp_size} \ 53 | --moe-expert-parallel-size ${ep_size} \ 54 | --vocab-file ${vocab_file} \ 55 | --merge-file ${merge_file} \ 56 | --micro-batch-size ${batch_size} \ 57 | --no-load-optim \ 58 | --no-load-rng \ 59 | --inference \ 60 | --disable-moe-token-dropping \ 61 | --adaptive_seq_len \ 62 | --eval_fp32 \ 63 | --num_fewshot ${num_fewshot} \ 64 | --task_list ${tasks} \ 65 | --results_path ${result_file} \ 66 | --deepspeed \ 67 | --deepspeed_config ${config_path} \ 68 | ${megatron_required_args} \ 69 | " 70 | 71 | if [[ "${no_pp}" = "true" ]]; then 72 | command="${command} \ 73 | --no-pipeline-parallel" 74 | fi 75 | 76 | launcher="deepspeed --include=$hostname:$rank --master_port=${master_port}" 77 | $launcher $command &> "${result_path}/${tasks}_${num_fewshot}shot.log" -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | checkpoint_paths=( 4 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ 5 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ 6 | ) 7 | 8 | ## No need to use the exact training config json; this dummy one is fine. 9 | config_path=ds_config_eval_dummy.json 10 | username=$(whoami) 11 | result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results" 12 | 13 | ## Task(s) on the same row will be performed together in the same process.
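## For example, the row "boolq,copa" below runs both tasks in one eval process.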
14 | ## Other tasks could also run, but we skip them because they either didn't 15 | ## appear in the GPT-3 paper or had strange scores there: qqp, prost, cb, wic, mrpc, sst, wnli, 16 | ## pubmedqa, logiqa, qnli, sciq, mc_taco, mathqa. wikitext didn't 17 | ## appear in the paper either, but we include it as a perplexity task. 18 | tasks=( 19 | record 20 | triviaqa 21 | hellaswag 22 | arc_challenge 23 | arc_easy 24 | race 25 | multirc 26 | openbookqa 27 | lambada 28 | webqs 29 | winogrande 30 | piqa 31 | anli_r1,anli_r2,anli_r3 32 | boolq,copa 33 | rte,wsc 34 | wikitext 35 | ) 36 | 37 | ## Use localhost if you didn't set up a hostfile as described in 38 | ## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. 39 | ## If a hostfile exists, use a hostname (e.g., worker-0) from the hostfile. 40 | # hostname="localhost" 41 | hostname="worker-0" 42 | 43 | batch_size=32 44 | 45 | ## This script is for zero-shot evaluation 46 | num_fewshot=0 47 | 48 | num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 49 | cuda_id=-1 50 | total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) 51 | 52 | ## The code below only works when you run each evalharness task on a single GPU. 53 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh 54 | for l in "${!checkpoint_paths[@]}"; do 55 | checkpoint_path=${checkpoint_paths[l]} 56 | for ((i=0;i<${#tasks[@]};++i)); do 57 | task=${tasks[i]} 58 | free_mem=0 59 | while [ $free_mem -lt $total_mem ]; do 60 | cuda_id=$(((cuda_id+1)%num_gpus)) 61 | free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) 62 | sleep 60s 63 | done 64 | bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & 65 | done 66 | done 67 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | checkpoint_paths=( 4 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ 5 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ 6 | ) 7 | 8 | ## No need to use the exact training config json; this dummy one is fine. 9 | config_path=ds_config_eval_dummy.json 10 | username=$(whoami) 11 | result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results_10shot" 12 | 13 | ## Task(s) on the same row will be performed together in the same process. 14 | tasks=( 15 | record 16 | triviaqa 17 | hellaswag 18 | arc_challenge 19 | arc_easy 20 | race 21 | multirc 22 | openbookqa 23 | lambada 24 | webqs 25 | winogrande 26 | piqa 27 | anli_r1,anli_r2 28 | anli_r3 29 | boolq,copa 30 | rte,wsc 31 | ) 32 | 33 | num_fewshot=10 34 | 35 | ## Use localhost if you didn't set up a hostfile as described in 36 | ## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. 37 | ## If a hostfile exists, use a hostname (e.g., worker-0) from the hostfile.
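## A minimal hostfile for the deepspeed launcher is one "<hostname> slots=<num_gpus>"
## line per node, e.g. (an illustrative example):
##   worker-0 slots=8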
38 | # hostname="localhost" 39 | hostname="worker-0" 40 | 41 | batch_size=16 42 | 43 | num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 44 | cuda_id=-1 45 | total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) 46 | 47 | ## Code below only works when you run each evalharness task on a single GPU. 48 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh 49 | for l in "${!checkpoint_paths[@]}"; do 50 | checkpoint_path=${checkpoint_paths[l]} 51 | for ((i=0;i<${#tasks[@]};++i)); do 52 | task=${tasks[i]} 53 | free_mem=0 54 | while [ $free_mem -lt $total_mem ]; do 55 | cuda_id=$(((cuda_id+1)%num_gpus)) 56 | free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) 57 | sleep 60s 58 | done 59 | bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "decoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | 
"steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "decoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | }, 70 | "CL_2nd_METRIC_NAME": { 71 | "index_to_sample_path": "CL_2nd_SAMPLE_PATH", 72 | "index_to_metric_path": "CL_2nd_METRIC_PATH", 73 | "difficulty_type": "CL_2nd_DIFF_TYPE", 74 | "clustering_type": "CL_2nd_CLUSTER_TYPE", 75 | "min_difficulty": CL_2nd_MIN, 76 | "max_difficulty": CL_2nd_MAX, 77 | "schedule_type": "fixed_root", 78 | "schedule_config": { 79 | "total_curriculum_step": CL_2nd_TOTAL_STEP, 80 | "difficulty_step": CL_2nd_DIFF_STEP, 81 | "root_degree": CL_2nd_ROOT 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/evaluate_ict_zeroshot_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task ICT-ZEROSHOT-NQ \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --checkpoint-activations \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --num-workers 2 \ 33 | 
--faiss-use-gpu \ 34 | --retriever-report-topk-accuracies 1 5 20 100 \ 35 | --fp16 36 | 37 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA 
\ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | b=8 7 | mp=1 8 | experts=1 9 | nodes=1 10 | gpus=1 11 | 12 | 13 | use_tutel="" 14 | #use_tutel="--use-tutel" 15 | 16 | 17 | #ds_inference="" 18 | ds_inference="--ds-inference" 19 | 20 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" 21 | L=24 22 | H=1024 23 | A=16 24 | #experts1=${experts[$k]} 25 | program_cmd="tools/generate_samples_gpt.py \ 26 | --tensor-model-parallel-size $mp \ 27 | --num-layers $L \ 28 | --hidden-size $H \ 29 | --num-attention-heads $A \ 30 | --max-position-embeddings 1024 \ 31 | --tokenizer-type GPT2BPETokenizer \ 32 | --fp16 \ 33 | --num-experts ${experts} \ 34 | --mlp-type standard \ 35 | --micro-batch-size $b \ 36 | --seq-length 1024 \ 37 | --out-seq-length 1024 \ 38 | --temperature 1.0 \ 39 | --vocab-file $VOCAB_FILE \ 40 | --merge-file $MERGE_FILE \ 41 | --genfile unconditional_samples.json \ 42 | --top_p 0.9 \ 43 | --log-interval 1 \ 44 | --num-samples 0 \ 45 | --load $CHECKPOINT_PATH \ 46 | $use_tutel $ds_inference" 47 | 48 | echo $launch_cmd $program_cmd 49 | $launch_cmd $program_cmd 50 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path 
$DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_bert.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 32 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --train-iters 1000000 \ 26 | --save $CHECKPOINT_PATH \ 27 | --load $CHECKPOINT_PATH \ 28 | --data-path $DATA_PATH \ 29 | --vocab-file bert-vocab.txt \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.0001 \ 34 | --lr-decay-style linear \ 35 | --min-lr 1.0e-5 \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --lr-warmup-fraction .01 \ 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_bert.py \ 19 | --tensor-model-parallel-size 2 \ 20 | --pipeline-model-parallel-size 2 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --micro-batch-size 2 \ 25 | --global-batch-size 16 \ 26 | --max-position-embeddings 512 \ 27 | --train-iters 1000000 \ 28 | --save $CHECKPOINT_PATH \ 29 | --load $CHECKPOINT_PATH \ 30 | --data-path $DATA_PATH \ 31 | --vocab-file $VOCAB_FILE \ 32 | --data-impl mmap \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr 0.0001 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --lr-decay-iters 990000 \ 39 | --weight-decay 1e-2 \ 40 | --clip-grad 1.0 \ 41 | --lr-warmup-fraction .01 \ 42 | --log-interval 100 \ 43 | --save-interval 10000 \ 44 | --eval-interval 1000 \ 45 | --eval-iters 10 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 \ 52 | --checkpoint-activations " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --micro-batch-size 8 \ 24 | --global-batch-size 64 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --tensor-model-parallel-size 2 \ 21 | --pipeline-model-parallel-size 2 \ 22 | --num-layers 24 \ 23 | --hidden-size 1024 \ 24 | --num-attention-heads 16 \ 25 | --micro-batch-size 4 \ 26 | --global-batch-size 16 \ 27 | --seq-length 1024 \ 28 | --max-position-embeddings 1024 \ 29 | --train-iters 500000 \ 30 | --lr-decay-iters 320000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file gpt2-vocab.json \ 35 | --merge-file gpt2-merges.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --distributed-backend nccl \ 39 | --lr 0.00015 \ 40 | --lr-decay-style cosine \ 41 | --min-lr 1.0e-5 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --lr-warmup-fraction .01 \ 45 | --checkpoint-activations \ 46 | --log-interval 100 \ 47 | --save-interval 10000 \ 48 | --eval-interval 1000 \ 49 | --eval-iters 10 \ 50 | --fp16 51 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_llama_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | ###################################### 5 | # Change the configurations below as needed 6 | BASE_PATH=./tmp 7 | DS_CONFIG=${BASE_PATH}/deepspeed.json 8 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" 9 | DATASET="1 ${DATASET_1}" 10 | CHECKPOINT_PATH=./tmp 11 | TOKENIZER_PATH=./tmp/tokenizer.model # official llama tokenizer.model 12 | 13 | TP=2 14 | PP=2 15 | ZERO_STAGE=0 16 | 17 | GPUS_PER_NODE=8 18 | MASTER_ADDR=localhost 19 | MASTER_PORT=6000 20 | NNODES=1 21 | NODE_RANK=0 22 | 23 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 24 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 25 | NUM_LAYERS=24 # e.g. llama-13b: 40 26 | NUM_HEADS=16 # e.g. llama-13b: 40 27 | SEQ_LENGTH=2048 28 | 29 | MICRO_BATCH_SIZE=4 30 | GLOBAL_BATCH_SIZE=16 # e.g. llama: 4M tokens 31 | TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
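# (Sanity check on the token math above: this script's defaults give
# GLOBAL_BATCH_SIZE * SEQ_LENGTH = 16 * 2048 = 32K tokens per step; to match
# llama's ~4M-token batches you would raise GLOBAL_BATCH_SIZE to ~2048.)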
32 | LR=3e-4 33 | MIN_LR=3e-5 34 | LR_WARMUP_STEPS=2000 35 | WEIGHT_DECAY=0.1 36 | GRAD_CLIP=1 37 | 38 | # The configuration below is required for the llama model, as per the llama paper 39 | #--no-query-key-layer-scaling \ 40 | # --attention-dropout 0 \ 41 | # --hidden-dropout 0 \ 42 | # --position-embedding-type rope \ 43 | # --untie-embeddings-and-output-weights \ 44 | # --activation swiglu \ 45 | # --normalization rmsnorm \ 46 | # --no-bias-gelu-fusion \ 47 | # --no-bias-dropout-fusion \ 48 | # --no-bias \ 49 | ###################################### 50 | 51 | 52 | 53 | cat <<EOT > $DS_CONFIG 54 | { 55 | "train_batch_size" : $GLOBAL_BATCH_SIZE, 56 | "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, 57 | "steps_per_print": 1, 58 | 59 | "zero_optimization": { 60 | "stage": $ZERO_STAGE 61 | }, 62 | 63 | "bf16": { 64 | "enabled": true 65 | } 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 72 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 73 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 74 | 75 | 76 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 77 | 78 | torchrun $DISTRIBUTED_ARGS \ 79 | pretrain_gpt.py \ 80 | --tensor-model-parallel-size $TP \ 81 | --pipeline-model-parallel-size $PP \ 82 | --num-layers $NUM_LAYERS \ 83 | --hidden-size $HIDDEN_SIZE \ 84 | --ffn-hidden-size $FFN_HIDDEN_SIZE \ 85 | --num-attention-heads $NUM_HEADS \ 86 | --micro-batch-size $MICRO_BATCH_SIZE \ 87 | --global-batch-size $GLOBAL_BATCH_SIZE \ 88 | --seq-length $SEQ_LENGTH \ 89 | --max-position-embeddings $SEQ_LENGTH \ 90 | --train-iters $TRAIN_STEPS \ 91 | --save $CHECKPOINT_PATH \ 92 | --load $CHECKPOINT_PATH \ 93 | --data-path $DATASET \ 94 | --data-impl mmap \ 95 | --tokenizer-type SPTokenizer \ 96 | --tokenizer-model-file $TOKENIZER_PATH \ 97 | --split 949,50,1 \ 98 | --distributed-backend nccl \ 99 | --lr $LR \ 100 | --lr-decay-style cosine \ 101 | --min-lr $MIN_LR \ 102 | --weight-decay $WEIGHT_DECAY \ 103 | --clip-grad $GRAD_CLIP \ 104 | --lr-warmup-iters $LR_WARMUP_STEPS \ 105 | --optimizer adam \ 106 | --adam-beta1 0.9 \ 107 | --adam-beta2 0.95 \ 108 | --checkpoint-activations \ 109 | --log-interval 100 \ 110 | --save-interval 10000 \ 111 | --eval-interval 1000 \ 112 | --eval-iters 10 \ 113 | --bf16 \ 114 | --no-query-key-layer-scaling \ 115 | --attention-dropout 0 \ 116 | --hidden-dropout 0 \ 117 | --position-embedding-type rope \ 118 | --untie-embeddings-and-output-weights \ 119 | --activation swiglu \ 120 | --normalization rmsnorm \ 121 | --no-bias-gelu-fusion \ 122 | --no-bias-dropout-fusion \ 123 | --no-bias \ 124 | $ds_args -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path
$DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_t5.py \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_t5.py \ 18 | --tensor-model-parallel-size 2 \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --seq-length 512 \ 29 | --max-position-embeddings 512 \ 30 | --train-iters 1000000 \ 31 | --lr-decay-iters 1000000 \ 32 | --save $CHECKPOINT_PATH \ 33 | --load $CHECKPOINT_PATH \ 34 | --data-path $DATA_PATH \ 35 | --vocab-file t5-vocab.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --lr 0.0001 \ 39 | --min-lr 0.00001 \ 40 | --lr-decay-style linear \ 41 | --lr-warmup-fraction .01 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/run_deepspeed_example.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | BASE_PATH=/vc_data/Megatron-LM/data 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron 6 | DS_CONFIG=ds_config.json 7 | 8 | TP=1 9 | PP=1 10 | NLAYERS=24 11 | HIDDEN=512 12 | 13 | GLOBAL_BATCH=64 14 | MICRO_BATCH=4 15 | 16 | ZERO_STAGE=2 17 | 18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 20 | mkdir -p $OUTPUT_DIR 21 | 22 | cat <<EOT > $DS_CONFIG 23 | { 24 | "train_batch_size" : $GLOBAL_BATCH, 25 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 26 | "steps_per_print": 1, 27 | 28 | "zero_optimization": { 29 | "stage": $ZERO_STAGE 30 | }, 31 | 32 | "fp16": { 33 | "enabled": true, 34 | "initial_scale_power": 12 35 | }, 36 | 37 | "wall_clock_breakdown" : true 38 | } 39 | EOT 40 | 41 | export NCCL_DEBUG=warn 42 | 43 | ds_args="" 44 | ds_args=" --deepspeed ${ds_args}" 45 | ds_args=" --no-pipeline-parallel ${ds_args}" 46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 49 | 50 | 51 | deepspeed pretrain_gpt.py \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size $PP \ 54 | --num-layers $NLAYERS \ 55 | --hidden-size $HIDDEN \ 56 | --num-attention-heads 16 \ 57 | --seq-length 256 \ 58 | --loss-scale 12 \ 59 | --max-position-embeddings 1024 \ 60 | --micro-batch-size 4 \ 61 | --global-batch-size 1024 \ 62 | --train-iters 1000 \ 63 | --lr 6.0e-5 \ 64 | --min-lr 6.0e-6 \ 65 | --lr-decay-style cosine \ 66 | --log-interval 1 \ 67 | --eval-iters 40 \ 68 | --eval-interval 1000 \ 69 | --data-path $DATA_PATH \ 70 | --vocab-file $BASE_PATH/gpt2-vocab.json \ 71 | --merge-file $BASE_PATH/gpt2-merges.txt \ 72 | --save-interval 1000 \ 73 | --split 98,2,0 \ 74 | --clip-grad 1.0 \ 75 | --weight-decay 0.1 \ 76 | --adam-beta1 0.9 \ 77 | --adam-beta2 0.95 \ 78 | --init-method-std 0.006 \ 79 | --fp16 \ 80 | --checkpoint-activations \ 81 | --tensorboard-dir $OUTPUT_DIR \ 82 | $ds_args \ 83 | --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log 84 | 85 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LydiaXiaohongLi/Megatron-DeepSpeed/336573636757b6db74eab4218885460dc14cec58/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
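# Usage sketch (an illustrative, non-authoritative example; see pretrain_gpt.py
# for the real training flow):
#   from megatron import initialize_megatron, get_args, print_rank_0
#   initialize_megatron()              # parse args, set up distributed state
#   args = get_args()
#   print_rank_0(f"world size: {args.world_size}")  # prints once, on rank 0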
15 | import os 16 | import torch 17 | from deepspeed.accelerator import get_accelerator 18 | from .package_info import ( 19 | __description__, 20 | __contact_names__, 21 | __url__, 22 | __download_url__, 23 | __keywords__, 24 | __license__, 25 | __package_name__, 26 | __version__, 27 | ) 28 | 29 | from .global_vars import get_args 30 | from .global_vars import get_current_global_batch_size 31 | from .global_vars import get_num_microbatches 32 | from .global_vars import update_num_microbatches 33 | from .global_vars import get_tokenizer 34 | from .global_vars import get_tensorboard_writer 35 | from .global_vars import get_adlr_autoresume 36 | from .global_vars import get_timers 37 | from .initialize import initialize_megatron 38 | 39 | def print_rank_0(message): 40 | """If distributed is initialized, print only on rank 0.""" 41 | if torch.distributed.is_initialized(): 42 | if torch.distributed.get_rank() == 0: 43 | print(message, flush=True) 44 | else: 45 | print(message, flush=True) 46 | 47 | def is_last_rank(): 48 | return torch.distributed.get_rank() == ( 49 | torch.distributed.get_world_size() - 1) 50 | 51 | def print_rank_last(message): 52 | """If distributed is initialized, print only on last rank.""" 53 | if torch.distributed.is_initialized(): 54 | if is_last_rank(): 55 | print(message, flush=True) 56 | else: 57 | print(message, flush=True) 58 | 59 | def is_aml(): 60 | # Are we running inside an Azure Machine Learning (AML) environment? 61 | return 'AZUREML_EXPERIMENT_ID' in os.environ 62 | 63 | def is_rank_0(): 64 | """Check whether it is rank 0. For AML, check if it is rank 0 of a node""" 65 | if torch.distributed.is_initialized(): 66 | if torch.distributed.get_rank() == 0 or ( 67 | is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 68 | ): 69 | return True 70 | else: 71 | return False 72 | else: 73 | return True 74 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
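# Conceptually (an illustrative note, not part of the original file): blending
# three datasets with weights [0.5, 0.3, 0.2] builds a size-N index in which
# roughly 50%/30%/20% of positions point into datasets 0/1/2, so __getitem__(i)
# is a cheap two-array lookup: dataset_index[i] picks the dataset and
# dataset_sample_index[i] picks the sample within it.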
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indecies. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
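
The classes below keep only the fields needed to unpickle old checkpoints; the scaling logic itself is gone. For orientation, dynamic loss scaling follows roughly this rule, using exactly those fields (a sketch of the standard scheme, not the deprecated code):

def update_scale(scaler, has_overflow: bool) -> None:
    """Halve the scale on overflow; grow it after a clean window of steps."""
    if has_overflow:
        scaler.cur_scale = max(scaler.cur_scale / scaler.scale_factor,
                               scaler.min_scale)
        scaler.last_overflow_iter = scaler.cur_iter
    elif (scaler.cur_iter - scaler.last_overflow_iter) % scaler.scale_window == 0:
        # A full window without overflow: it is safe to try a larger scale.
        scaler.cur_scale *= scaler.scale_factor
    scaler.cur_iter += 1
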
15 | 16 | """For backward compatibility, we need the class definitions to deserialize.""" 17 | 18 | class LossScaler: 19 | def __init__(self, scale=1): 20 | self.cur_scale = scale 21 | 22 | class DynamicLossScaler: 23 | def __init__(self, 24 | init_scale=2**32, 25 | scale_factor=2., 26 | scale_window=1000, 27 | min_scale=1, 28 | delayed_shift=1, 29 | consecutive_hysteresis=False): 30 | self.cur_scale = init_scale 31 | self.cur_iter = 0 32 | self.last_overflow_iter = -1 33 | self.scale_factor = scale_factor 34 | self.scale_window = scale_window 35 | self.min_scale = min_scale 36 | self.delayed_shift = delayed_shift 37 | self.cur_hysteresis = delayed_shift 38 | self.consecutive_hysteresis = consecutive_hysteresis 39 | 40 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     torch::Tensor const& mask,
28 |     float scale_factor);
29 | 
30 | torch::Tensor bwd_cuda(
31 |     torch::Tensor const& output_grads,
32 |     torch::Tensor const& softmax_results,
33 |     float scale_factor);
34 | 
35 | torch::Tensor fwd(
36 |     torch::Tensor const& input,
37 |     torch::Tensor const& mask,
38 |     float scale_factor) {
39 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
40 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
41 |              (input.scalar_type() == at::ScalarType::BFloat16),
42 |              "Only fp16 and bf16 are supported");
43 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
44 | 
45 |   return fwd_cuda(input, mask, scale_factor);
46 | }
47 | 
48 | torch::Tensor bwd(
49 |     torch::Tensor const& output_grads,
50 |     torch::Tensor const& softmax_results,
51 |     float scale_factor) {
52 | 
53 |   AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor");
54 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor");
55 | 
56 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
57 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
58 |              "Only fp16 and bf16 are supported");
59 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
60 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
61 |              "Only fp16 and bf16 are supported");
62 | 
63 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
64 | }
65 | 
66 | } // end namespace scaled_masked_softmax
67 | } // end namespace fused_softmax
68 | } // end namespace multihead_attn
69 | 
70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
71 |   m.def("forward",
72 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
73 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
74 |   m.def("backward",
75 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
76 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
77 | }
78 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads,
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
35 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
36 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
37 |              (input.scalar_type() == at::ScalarType::BFloat16),
38 |              "Only fp16 and bf16 are supported");
39 | 
40 |   return fwd_cuda(input, scale_factor);
41 | }
42 | 
43 | torch::Tensor bwd(
44 |     torch::Tensor const& output_grads,
45 |     torch::Tensor const& softmax_results,
46 |     float scale_factor) {
47 | 
48 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
49 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
50 | 
51 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
52 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
53 |              "Only fp16 and bf16 are supported");
54 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
55 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
56 |              "Only fp16 and bf16 are supported");
57 | 
58 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
59 | }
60 | 
61 | } // end namespace scaled_upper_triang_masked_softmax
62 | } // end namespace fused_softmax
63 | } // end namespace multihead_attn
64 | 
65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
66 |   m.def("forward",
67 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
68 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
69 |   m.def("backward",
70 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
71 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
72 | }
73 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <cuda.h>
19 | #include <cuda_runtime.h>
20 | #include <cuda_fp16.h>
21 | #ifndef __HIP_PLATFORM_HCC__
22 | #include <cuda_profiler_api.h>
23 | #endif
24 | #include <ATen/cuda/CUDAContext.h>
25 | #include <torch/extension.h>
26 | #include "scaled_upper_triang_masked_softmax.h"
27 | #include "type_shim.h"
28 | 
29 | namespace multihead_attn {
30 | namespace fused_softmax {
31 | namespace scaled_upper_triang_masked_softmax {
32 | 
33 | torch::Tensor fwd_cuda(
34 |     torch::Tensor const& input,
35 |     float scale_factor)
36 | {
37 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
38 |   const int attn_batches = input.size(0);
39 |   const int seq_len = input.size(1);
40 |   TORCH_INTERNAL_ASSERT(seq_len <= 2048);
41 | 
42 |   // Output
43 |   auto act_options = input.options().requires_grad(false);
44 |   torch::Tensor softmax_results =
45 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
46 | 
47 |   // Softmax Intermediate Result Ptr
48 |   void* input_ptr = static_cast<void*>(input.data_ptr());
49 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
50 | 
51 |   DISPATCH_HALF_AND_BFLOAT(
52 |       input.scalar_type(),
53 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
54 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
55 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
56 |           reinterpret_cast<const scalar_t*>(input_ptr),
57 |           scale_factor,
58 |           seq_len,
59 |           seq_len,
60 |           attn_batches);
61 |   );
62 |   return softmax_results;
63 | }
64 | 
65 | 
66 | torch::Tensor bwd_cuda(
67 |     torch::Tensor const& output_grads_,
68 |     torch::Tensor const& softmax_results_,
69 |     float scale_factor)  {
70 | 
71 |   auto output_grads = output_grads_.contiguous();
72 |   auto softmax_results = softmax_results_.contiguous();
73 | 
74 |   //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
75 |   const int attn_batches = output_grads.size(0);
76 |   const int seq_len = output_grads.size(1);
77 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
78 | 
79 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
80 | 
81 |   //Softmax Grad
82 |   DISPATCH_HALF_AND_BFLOAT(
83 |       output_grads_.scalar_type(),
84 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
85 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
86 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
87 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
88 |           reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
89 |           scale_factor,
90 |           seq_len,
91 |           seq_len,
92 |           attn_batches);
93 |   );
94 | 
95 |   //backward pass is completely in-place
96 |   return output_grads;
97 | }
98 | }
99 | }
100 | }
101 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | 
18 | #include <ATen/ATen.h>
19 | #include "compat.h"
20 | 
21 | 
22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)
\ 23 | switch(TYPE) \ 24 | { \ 25 | case at::ScalarType::Half: \ 26 | { \ 27 | using scalar_t = at::Half; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::BFloat16: \ 32 | { \ 33 | using scalar_t = at::BFloat16; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | default: \ 38 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 39 | } 40 | 41 | 42 | 43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 44 | switch(TYPEIN) \ 45 | { \ 46 | case at::ScalarType::Float: \ 47 | { \ 48 | using scalar_t_in = float; \ 49 | switch(TYPEOUT) \ 50 | { \ 51 | case at::ScalarType::Float: \ 52 | { \ 53 | using scalar_t_out = float; \ 54 | __VA_ARGS__; \ 55 | break; \ 56 | } \ 57 | case at::ScalarType::Half: \ 58 | { \ 59 | using scalar_t_out = at::Half; \ 60 | __VA_ARGS__; \ 61 | break; \ 62 | } \ 63 | case at::ScalarType::BFloat16: \ 64 | { \ 65 | using scalar_t_out = at::BFloat16; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | default: \ 70 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 71 | } \ 72 | break; \ 73 | } \ 74 | case at::ScalarType::Half: \ 75 | { \ 76 | using scalar_t_in = at::Half; \ 77 | using scalar_t_out = at::Half; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | case at::ScalarType::BFloat16: \ 82 | { \ 83 | using scalar_t_in = at::BFloat16; \ 84 | using scalar_t_out = at::BFloat16; \ 85 | __VA_ARGS__; \ 86 | break; \ 87 | } \ 88 | default: \ 89 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 90 | } 91 | 92 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from deepspeed.accelerator.real_accelerator import get_accelerator 16 | if get_accelerator().device_name() == 'cuda': 17 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 18 | else: 19 | from torch.nn import LayerNorm 20 | from .distributed import DistributedDataParallel 21 | from .bert_model import BertModel 22 | from .gpt_model import GPTModel, GPTModelPipe 23 | from .llama_model import LlamaModel, LlamaModelPipe 24 | from .t5_model import T5Model 25 | from .language_model import get_language_model 26 | from .module import Float16Module 27 | from .rotary_pos_embedding import RotaryEmbedding 28 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
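
The tanh curve in fused_bias_gelu.py above is only an approximation of GeLU; a quick way to sanity-check both it and the hand-derived gradient is to compare against torch's erf-based GeLU and against autograd (test sketch, not part of the module):

import torch
from megatron.model.fused_bias_gelu import bias_gelu, bias_gelu_back

y = torch.randn(64, dtype=torch.double, requires_grad=True)
bias = torch.randn(64, dtype=torch.double, requires_grad=True)

out = bias_gelu(bias, y)
exact = torch.nn.functional.gelu(y + bias)      # erf-based reference
assert torch.allclose(out, exact, atol=1e-2)    # tanh approx is close, not equal

g = torch.ones_like(out)
manual = bias_gelu_back(g, bias, y)             # hand-derived gradient
auto, = torch.autograd.grad(out.sum(), y)       # autograd through the same formula
assert torch.allclose(manual, auto, atol=1e-10)
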
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """This code is copied from NVIDIA apex:
17 |       https://github.com/NVIDIA/apex
18 |    with some changes. """
19 | 
20 | import numbers
21 | import torch
22 | from torch.nn.parameter import Parameter
23 | from torch.nn import init
24 | import importlib
25 | from torch.nn import functional as F
26 | 
27 | global fused_mix_prec_layer_norm_cuda
28 | fused_mix_prec_layer_norm_cuda = None
29 | 
30 | 
31 | class FusedLayerNormAffineFunction(torch.autograd.Function):
32 | 
33 |   @staticmethod
34 |   def forward(ctx, input, weight, bias, normalized_shape, eps):
35 | 
36 |     ctx.normalized_shape = normalized_shape
37 |     ctx.eps = eps
38 |     input_ = input.contiguous()
39 |     weight_ = weight.contiguous()
40 |     bias_ = bias.contiguous()
41 |     output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
42 |         input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
43 |     ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
44 | 
45 |     return output
46 | 
47 | 
48 |   @staticmethod
49 |   def backward(ctx, grad_output):
50 | 
51 |     input_, weight_, bias_, mean, invvar = ctx.saved_tensors
52 |     grad_input = grad_weight = grad_bias = None
53 |     grad_input, grad_weight, grad_bias \
54 |       = fused_mix_prec_layer_norm_cuda.backward_affine(
55 |           grad_output.contiguous(), mean, invvar,
56 |           input_, ctx.normalized_shape,
57 |           weight_, bias_, ctx.eps)
58 | 
59 |     return grad_input, grad_weight, grad_bias, None, None
60 | 
61 | 
62 | 
63 | class MixedFusedLayerNorm(torch.nn.Module):
64 | 
65 |   def __init__(self, normalized_shape, eps=1e-5):
66 |     super(MixedFusedLayerNorm, self).__init__()
67 | 
68 |     global fused_mix_prec_layer_norm_cuda
69 |     fused_mix_prec_layer_norm_cuda = importlib.import_module(
70 |       "fused_mix_prec_layer_norm_cuda")
71 | 
72 |     if isinstance(normalized_shape, numbers.Integral):
73 |         normalized_shape = (normalized_shape,)
74 |     self.normalized_shape = torch.Size(normalized_shape)
75 |     self.eps = eps
76 |     self.weight = Parameter(torch.Tensor(*normalized_shape))
77 |     self.bias = Parameter(torch.Tensor(*normalized_shape))
78 |     self.reset_parameters()
79 | 
80 | 
81 |   def reset_parameters(self):
82 | 
83 |     init.ones_(self.weight)
84 |     init.zeros_(self.bias)
85 | 
86 | 
87 |   def forward(self, input):
88 |     # CPU path is here for unittest sake.
89 |     if not input.is_cuda:
90 |         print("WARNING! The input of FusedLayerNorm should be on the GPU. "
91 |               "This warning should only be triggered in the FusedLayerNorm unit tests.")
92 |         return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
93 |     return FusedLayerNormAffineFunction.apply(
94 |         input, self.weight, self.bias, self.normalized_shape, self.eps)
95 | 
--------------------------------------------------------------------------------
/megatron/model/rotary_pos_embedding.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
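
For intuition, forward_affine above computes the usual affine layer norm (and additionally hands back mean and invvar for the backward pass). A plain-PyTorch equivalent of its output (reference sketch, not the fused kernel):

import torch

def layer_norm_affine_ref(x, weight, bias, normalized_shape, eps):
    # Statistics are taken over the trailing `normalized_shape` dimensions.
    dims = tuple(range(-len(normalized_shape), 0))
    mean = x.mean(dim=dims, keepdim=True)
    var = x.var(dim=dims, unbiased=False, keepdim=True)
    invvar = torch.rsqrt(var + eps)
    return (x - mean) * invvar * weight + bias

x = torch.randn(4, 1024)
ref = layer_norm_affine_ref(x, torch.ones(1024), torch.zeros(1024), (1024,), 1e-5)
assert torch.allclose(ref, torch.nn.functional.layer_norm(x, (1024,)), atol=1e-5)
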
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import torch
17 | from einops import rearrange
18 | from torch import einsum, nn
19 | 
20 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
21 | 
22 | 
23 | class RotaryEmbedding(nn.Module):
24 |     def __init__(self, dim):
25 |         super().__init__()
26 |         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
27 |         self.register_buffer('inv_freq', inv_freq)
28 | 
29 |     def forward(self, max_seq_len, offset=0):
30 |         seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
31 |         freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
32 |         # first part even vector components, second part odd vector components,
33 |         # 2 * dim in dimension size
34 |         emb = torch.cat((freqs, freqs), dim=-1)
35 |         # emb [seq_length, .., dim]
36 |         return rearrange(emb, 'n d -> n 1 1 d')
37 | 
38 | 
39 | def _rotate_half(x):
40 |     """
41 |     change sign so the last dimension becomes [-odd, +even]
42 |     """
43 |     x = rearrange(x, '... (j d) -> ... j d', j=2)
44 |     x1, x2 = x.unbind(dim=-2)
45 |     return torch.cat((-x2, x1), dim=-1)
46 | 
47 | 
48 | def apply_rotary_pos_emb(t, freqs):
49 |     """
50 |     input tensor t is of shape [seq_length, ..., dim]
51 |     rotary positional embedding tensor freqs is of shape [seq_length, ..., dim]
52 |     check https://kexue.fm/archives/8265 for detailed formulas
53 |     """
54 |     rot_dim = freqs.shape[-1]
55 |     # ideally t_pass is empty, so the rotary position embedding is applied to all of t
56 |     t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
57 |     # first part is cosine component
58 |     # second part is sine component, need to change signs with _rotate_half method
59 |     t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin())
60 |     return torch.cat((t, t_pass), dim=-1)
61 | 
--------------------------------------------------------------------------------
/megatron/model/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
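
Putting the rotary pieces above together: the freqs tensor from RotaryEmbedding broadcasts over the batch and head axes of a [seq, batch, heads, head_dim] query, and since each (even, odd) pair is rotated by an angle, vector norms are preserved. A small usage sketch:

import torch
from megatron.model.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb

seq_len, batch, heads, head_dim = 16, 2, 4, 32
q = torch.randn(seq_len, batch, heads, head_dim)

rotary = RotaryEmbedding(head_dim)
freqs = rotary(seq_len)                 # [seq_len, 1, 1, head_dim]
q_rot = apply_rotary_pos_emb(q, freqs)  # same shape, positions mixed into q
assert q_rot.shape == q.shape
# Per-position rotation preserves the norm of each head vector.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-4)
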
15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from megatron import get_args 23 | 24 | def init_method_normal(sigma): 25 | """Init method based on N(0, sigma).""" 26 | def init_(tensor): 27 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 28 | 29 | return init_ 30 | 31 | 32 | def scaled_init_method_normal(sigma, num_layers): 33 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 34 | std = sigma / math.sqrt(2.0 * num_layers) 35 | 36 | def init_(tensor): 37 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 38 | 39 | return init_ 40 | 41 | 42 | def attention_mask_func(attention_scores, attention_mask): 43 | args = get_args() 44 | if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: 45 | attention_mask_ = attention_mask 46 | actual_seqlen = attention_scores.size()[2] 47 | if actual_seqlen != attention_mask_.size()[2]: 48 | # attention_mask has size [1, 1, seqlen, seqlen] 49 | attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() 50 | attention_scores.masked_fill_(attention_mask_, -10000.0) 51 | else: 52 | attention_scores.masked_fill_(attention_mask, -10000.0) 53 | return attention_scores 54 | 55 | 56 | def get_linear_layer(rows, columns, init_method): 57 | """Simple linear layer with weight initialization.""" 58 | layer = torch.nn.Linear(rows, columns) 59 | init_method(layer.weight) 60 | with torch.no_grad(): 61 | layer.bias.zero_() 62 | return layer 63 | 64 | @torch.jit.script 65 | def gelu_impl(x): 66 | """OpenAI's gelu implementation.""" 67 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 68 | (1.0 + 0.044715 * x * x))) 69 | def openai_gelu(x): 70 | return gelu_impl(x) 71 | 72 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 73 | @torch.jit.script 74 | def erf_gelu(x): 75 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 76 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import get_tensor_model_parallel_src_rank 35 | from .initialize import get_pipeline_model_parallel_first_rank 36 | from .initialize import get_pipeline_model_parallel_last_rank 37 | from .initialize import get_pipeline_model_parallel_next_rank 38 | from .initialize import get_pipeline_model_parallel_prev_rank 39 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 40 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 41 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 42 | from .initialize import initialize_model_parallel 43 | from .initialize import model_parallel_is_initialized 44 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 45 | 46 | from .layers import ColumnParallelLinear 47 | from .layers import RowParallelLinear 48 | from .layers import VocabParallelEmbedding 49 | from .layers import (set_tensor_model_parallel_attributes, 50 | set_defaults_if_not_set_tensor_model_parallel_attributes, 51 | copy_tensor_model_parallel_attributes) 52 | 53 | from .mappings import copy_to_tensor_model_parallel_region 54 | from .mappings import gather_from_tensor_model_parallel_region 55 | from .mappings import reduce_from_tensor_model_parallel_region 56 | from .mappings import scatter_to_tensor_model_parallel_region 57 | 58 | from .random import checkpoint 59 | from .random import get_cuda_rng_tracker 60 | from .random import init_checkpointed_activations_memory_buffer 61 | from .random import model_parallel_cuda_manual_seed 62 | from .random import reset_checkpointed_activations_memory_buffer 63 | from .random import gather_split_1d_tensor 64 | from .random import split_tensor_into_1d_equal_chunks 65 | 66 | from .utils import divide 67 | from .utils import split_tensor_along_last_dim 68 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LydiaXiaohongLi/Megatron-DeepSpeed/336573636757b6db74eab4218885460dc14cec58/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | from deepspeed.accelerator import get_accelerator 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducability.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % get_accelerator().device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | get_accelerator().set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from deepspeed.accelerator import get_accelerator 19 | from mpu import data as data_utils 20 | import mpu 21 | import torch 22 | import functools 23 | import operator 24 | import sys 25 | sys.path.append("../..") 26 | 27 | 28 | def test_broadcast_data(tensor_model_parallel_size): 29 | 30 | if torch.distributed.get_rank() == 0: 31 | print('> testing broadcast_data with model parallel size {} ...'. 32 | format(tensor_model_parallel_size)) 33 | 34 | mpu.initialize_model_parallel(tensor_model_parallel_size) 35 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 36 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 37 | 38 | key_size_t = {'key1': [7, 11], 39 | 'key2': [8, 2, 1], 40 | 'key3': [13], 41 | 'key4': [5, 1, 2], 42 | 'key5': [5, 12]} 43 | keys = list(key_size_t.keys()) 44 | 45 | data = {} 46 | data_t = {} 47 | for key in key_size_t: 48 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 49 | data_t[key] = data[key].clone() 50 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 51 | data_t['keyX'] = data['keyX'].clone() 52 | if mpu.get_tensor_model_parallel_rank() != 0: 53 | data = None 54 | 55 | data_utils._check_data_types(keys, data_t, torch.int64) 56 | key_size, key_numel, \ 57 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 58 | for key in keys: 59 | assert key_size[key] == key_size_t[key] 60 | total_numel_t = 0 61 | for key in keys: 62 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 63 | assert key_numel[key] == target_size 64 | total_numel_t += target_size 65 | assert total_numel == total_numel_t 66 | 67 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 68 | for key in keys: 69 | tensor = data_t[key].to(get_accelerator().device_name()) 70 | assert data_b[key].sub(tensor).abs().max() == 0 71 | 72 | # Reset groups 73 | mpu.destroy_tensor_model_parallel() 74 | 75 | torch.distributed.barrier() 76 | if torch.distributed.get_rank() == 0: 77 | print('>> passed the test :-)') 78 | 79 | 80 | if __name__ == '__main__': 81 | 82 | initialize_distributed() 83 | world_size = torch.distributed.get_world_size() 84 | 85 | tensor_model_parallel_size = 1 86 | while tensor_model_parallel_size <= world_size: 87 | print_separator('test test broadcast data') 88 | test_broadcast_data(tensor_model_parallel_size) 89 | tensor_model_parallel_size *= 2 90 | -------------------------------------------------------------------------------- /megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
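
mpu/utils.py below provides the divisibility helpers used throughout the tensor-parallel layers, for example to split a hidden dimension across ranks and to locate a rank's vocabulary shard (usage sketch):

import torch
from megatron.mpu.utils import divide, split_tensor_along_last_dim, VocabUtility

# 4-way tensor parallelism over a 1024-wide activation: each rank gets 256.
x = torch.randn(8, 1024)
chunks = split_tensor_along_last_dim(x, 4)
assert all(c.shape == (8, 256) for c in chunks)

# Rank 1 of 4 owns vocabulary ids [12800, 25600) of a 51200-entry vocab.
first, last = VocabUtility.vocab_range_from_global_vocab_size(51200, 1, 4)
assert (first, last) == (12800, 25600)
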
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/megatron/package_info.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | MAJOR = 1
17 | MINOR = 1.5
18 | 
19 | # Use the following formatting: (major, minor)
20 | VERSION = (MAJOR, MINOR)
21 | 
22 | __version__ = '.'.join(map(str, VERSION))
23 | __package_name__ = 'megatron-lm'
24 | __contact_names__ = 'NVIDIA INC'
25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM'
26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /megatron/tokenizer/sp_tokenization.py: -------------------------------------------------------------------------------- 1 | # from: https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py 2 | """Tokenization classes for SentencePiece tokenizer""" 3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from typing import List 6 | import os 7 | 8 | 9 | 10 | class SPTokenizer: 11 | def __init__(self, model_path: str): 12 | # reload tokenizer 13 | assert os.path.isfile(model_path), model_path 14 | self.sp_model = SentencePieceProcessor(model_file=model_path) 15 | 16 | # BOS / EOS token IDs 17 | self.n_words: int = self.sp_model.vocab_size() 18 | self.bos_id: int = self.sp_model.bos_id() 19 | self.eos_id: int = self.sp_model.eos_id() 20 | self.pad_id: int = self.sp_model.pad_id() 21 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 22 | 23 | def encode(self, s, bos=False, eos=False) -> List[int]: 24 | assert type(s) is str 25 | t = self.sp_model.encode(s) 26 | if bos: 27 | t = [self.bos_id] + t 28 | if eos: 29 | t = t + [self.eos_id] 30 | return t 31 | 32 | def decode(self, t: List[int]) -> str: 33 | return self.sp_model.decode(t) -------------------------------------------------------------------------------- /pretrain_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
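
The script that follows is the whole task-specific surface area Megatron asks for: pretrain() consumes three callables. Their shapes, written out as type aliases (our annotation, inferred from the code below; Megatron itself is untyped):

from typing import Callable, Dict, Iterator, Optional, Sequence, Tuple
import torch

ModelProvider = Callable[[], torch.nn.Module]
ForwardStep = Callable[
    [Iterator, torch.nn.Module, Optional[torch.Tensor]],
    Tuple[torch.Tensor, Dict[str, torch.Tensor]],
]
DatasetsProvider = Callable[[Sequence[int]], Tuple]  # (train, valid, test-or-None)
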
15 | 16 | """Pretrain VIT""" 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | from megatron import get_args, get_timers, mpu, print_rank_0 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from megatron.model.vit_model import VitModel 23 | from megatron.training import pretrain 24 | from megatron.utils import average_losses_across_data_parallel_group 25 | 26 | def model_provider(): 27 | """Build the model.""" 28 | 29 | print_rank_0("building VIT model ...") 30 | args = get_args() 31 | 32 | model = VitModel(num_classes=args.num_classes) 33 | return model 34 | 35 | def get_batch(data_iterator): 36 | """Build the batch.""" 37 | data = next(data_iterator) 38 | 39 | # only data parallelism; no need for broadcast 40 | images = data[0].to(get_accelerator().device_name()) 41 | labels = data[1].to(get_accelerator().device_name()) 42 | 43 | return images, labels 44 | 45 | def forward_step(data_iterator, model, input_tensor): 46 | """Forward step.""" 47 | timers = get_timers() 48 | assert input_tensor is None 49 | 50 | # Get the batch. 51 | timers("batch-generator").start() 52 | ( 53 | images, 54 | labels, 55 | ) = get_batch(data_iterator) 56 | timers("batch-generator").stop() 57 | 58 | # Forward model. lm_labels 59 | logits = model(images).contiguous().float() 60 | loss = F.cross_entropy(logits, labels) 61 | 62 | outputs = torch.argmax(logits, -1) 63 | correct = (outputs == labels).float() 64 | accuracy = torch.mean(correct) 65 | 66 | averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) 67 | 68 | return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} 69 | 70 | 71 | def train_valid_test_datasets_provider(train_val_test_num_samples): 72 | """Build train, valid, and test datasets.""" 73 | args = get_args() 74 | 75 | print_rank_0( 76 | "> building train, validation, and test datasets " "for VIT ..." 77 | ) 78 | train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path) 79 | print_rank_0("> finished creating VIT datasets ...") 80 | 81 | return train_ds, valid_ds, None 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | pretrain( 87 | train_valid_test_datasets_provider, 88 | model_provider, 89 | forward_step, 90 | args_defaults={'dataloader_type': 'cyclic'} 91 | ) 92 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Setup for pip package.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified taks in the evaluation harness 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 
7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default="all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 4 | # under the license https://huggingface.co/spaces/bigscience/license 5 | 6 | # this script converts results.json: 7 | # 8 | # "results": { 9 | # "arc_challenge": { 10 | # "acc": 0.24232081911262798, 11 | # "acc_stderr": 0.01252159329580012, 12 | # "acc_norm": 0.2764505119453925, 13 | # "acc_norm_stderr": 0.013069662474252425 14 | # }, 15 | # 16 | # into a format expected by a spreadsheet, which is: 17 | # 18 | # task metric value err 19 | # arc_challenge acc xxx yyy 20 | # arc_challenge acc_norm xxx yyy 21 | # arc_challenge f1 xxx yyy 22 | # 23 | # usage: 24 | # report-to-csv.py results.json 25 | 26 | 27 | import sys 28 | import json 29 | import io 30 | import csv 31 | 32 | results_file = sys.argv[1] 33 | 34 | csv_file = results_file.replace(".json", ".csv") 35 | 36 | print(f"Converting {results_file} to {csv_file}") 37 | 38 | with io.open(results_file, 'r', encoding='utf-8') as f: 39 | results = json.load(f) 40 | 41 | with io.open(csv_file, 'w', encoding='utf-8') as f: 42 | 43 | writer = csv.writer(f) 44 | writer.writerow(["task", "metric", "value", "err", "version"]) 45 | 46 | versions = results["versions"] 47 | 48 | for k, v in sorted(results["results"].items()): 49 | if k not in versions: 50 | versions[k] = -1 51 | 52 | if "acc" in v: 53 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 54 | if "acc_norm" in v: 55 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 56 | if "f1" in v: 57 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 58 | # if "ppl" in v: 59 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 60 | # if "em" in v: 61 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 62 | -------------------------------------------------------------------------------- /tasks/glue/cola.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """CoLA dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = [0, 1] 24 | 25 | 26 | class CoLADataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label=0): 30 | self.test_label = test_label 31 | super().__init__('CoLA', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 2: 48 | is_test = True 49 | print_rank_0(' reading {} and {} columns and ' 50 | 'setting labels to {}'.format( 51 | row[0].strip(), row[1].strip(), 52 | self.test_label)) 53 | continue 54 | 55 | if is_test: 56 | assert len(row) == 2, 'expected length 2: {}'.format(row) 57 | uid = int(row[0].strip()) 58 | text_a = clean_text(row[1].strip()) 59 | text_b = None 60 | label = self.test_label 61 | assert len(text_a) > 0 62 | else: 63 | if len(row) == 4: 64 | uid = total 65 | text_a = clean_text(row[3].strip()) 66 | text_b = None 67 | label = int(row[1].strip()) 68 | else: 69 | print_rank_0('***WARNING*** index error, ' 70 | 'skipping: {}'.format(row)) 71 | continue 72 | if len(text_a) == 0: 73 | print_rank_0('***WARNING*** zero length a, ' 74 | 'skipping: {}'.format(row)) 75 | continue 76 | assert label in LABELS 77 | assert uid >= 0 78 | 79 | sample = {'uid': uid, 80 | 'text_a': text_a, 81 | 'text_b': text_b, 82 | 'label': label} 83 | total += 1 84 | samples.append(sample) 85 | 86 | if total % 50000 == 0: 87 | print_rank_0(' > processed {} so far ...'.format(total)) 88 | 89 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 90 | return samples 91 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs.
34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {} , {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /tasks/glue/sst2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """SST-2 dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = [0, 1] 24 | 25 | 26 | class SST2Dataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label=0): 30 | self.test_label = test_label 31 | super().__init__('SST-2', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if row[0].strip() == 'index': 48 | is_test = True 49 | print_rank_0(' reading {} and {} columns and ' 50 | 'setting labels to {}'.format( 51 | row[0].strip(), row[1].strip(), 52 | self.test_label)) 53 | else: 54 | assert len(row) == 2 55 | print_rank_0(' reading {} and {} columns' 56 | ' ...'.format( 57 | row[0].strip(), row[1].strip())) 58 | continue 59 | 60 | if is_test: 61 | assert len(row) == 2, 'expected length 2: {}'.format(row) 62 | uid = int(row[0].strip()) 63 | text_a = clean_text(row[1].strip()) 64 | text_b = None 65 | label = self.test_label 66 | assert len(text_a) > 0 67 | else: 68 | if len(row) == 2: 69 | uid = total 70 | text_a = clean_text(row[0].strip()) 71 | text_b = None 72 | label = int(row[-1].strip()) 73 | else: 74 | print_rank_0('***WARNING*** index error, ' 75 | 'skipping: {}'.format(row)) 76 | continue 77 | if len(text_a) == 0: 78 | print_rank_0('***WARNING*** zero length a, ' 79 | 'skipping: {}'.format(row)) 80 | continue 81 | assert label in LABELS 82 | assert uid >= 0 83 | 84 | sample = {'uid': uid, 85 | 'text_a': text_a, 86 | 'text_b': text_b, 87 | 'label': label} 88 | total += 1 89 | samples.append(sample) 90 | 91 | if total % 50000 == 0: 92 | print_rank_0(' > processed {} so far ...'.format(total)) 93 | 94 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 95 | return samples 96 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | from megatron import get_args 22 | from tasks.orqa.evaluate_utils import ORQAEvaluator 23 | 24 | def main(): 25 | """ 26 | Main program 27 | """ 28 | 29 | args = get_args() 30 | 31 | # Set up the model and evaluator 32 | evaluator = ORQAEvaluator() 33 | 34 | # Run evaluation 35 | if args.qa_data_dev is not None: 36 | evaluator.evaluate(args.qa_data_dev, "DEV") 37 | 38 | if args.qa_data_test is not None: 39 | evaluator.evaluate(args.qa_data_test, "TEST") 40 | 41 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Privde metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True) 44 | 45 | # Finetune/evaluate. 46 | finetune( 47 | train_valid_datasets_provider, 48 | model_provider, 49 | end_of_epoch_callback_provider=accuracy_func_provider, 50 | ) 51 | 52 | 53 | def main(): 54 | classification() 55 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | import torch 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import mpu 23 | from tasks.vision.finetune_utils import build_data_loader 24 | from tasks.vision.finetune_utils import process_batch 25 | from torchvision import datasets, transforms 26 | from deepspeed.accelerator import get_accelerator 27 | 28 | def accuracy_func_provider(): 29 | """Provide function that calculates accuracies.""" 30 | args = get_args() 31 | data_path = args.data_path 32 | crop_size = args.img_dim 33 | 34 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 35 | # Build dataloaders.
36 | val_data_path = os.path.join(data_path[0], "val") 37 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 38 | transform_val = transforms.Compose( 39 | [ 40 | transforms.Resize(crop_size), 41 | transforms.CenterCrop(crop_size), 42 | transforms.ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 47 | 48 | dataloader = build_data_loader( 49 | dataset, 50 | args.micro_batch_size, 51 | num_workers=args.num_workers, 52 | drop_last=(mpu.get_data_parallel_world_size() > 1), 53 | ) 54 | 55 | def metrics_func(model, epoch): 56 | print_rank_0("calculating metrics ...") 57 | correct, total = calculate_correct_answers(model, dataloader, epoch) 58 | percent = float(correct) * 100.0 / float(total) 59 | print_rank_0( 60 | " >> |epoch: {}| overall: correct / total = {} / {} = " 61 | "{:.4f} %".format(epoch, correct, total, percent) 62 | ) 63 | 64 | return metrics_func 65 | 66 | 67 | def calculate_correct_answers(model, dataloader, epoch): 68 | """Calculate correct over total answers""" 69 | 70 | model.eval() 71 | with torch.no_grad(): 72 | # For all the batches in the dataset. 73 | total = 0 74 | correct = 0 75 | for _, batch in enumerate(dataloader): 76 | # Run the model forward. 77 | images, labels = process_batch(batch) 78 | logits = model(images).contiguous().float() 79 | # Add output predictions. 80 | # Compute the correct answers. 81 | predicted = torch.argmax(logits, dim=-1) 82 | corrects = (predicted == labels).float() 83 | # Add to the counters. 84 | total += labels.size(0) 85 | correct += corrects.sum().item() 86 | model.train() 87 | 88 | # Reduce. 89 | unreduced = get_accelerator().LongTensor([correct, total]) 90 | torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) 91 | 92 | # Print on screen. 93 | correct_ans = unreduced[0].item() 94 | total_count = unreduced[1].item() 95 | return correct_ans, total_count 96 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. Zero results in "
43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? 
") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/test_megatron.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import re 4 | import subprocess 5 | 6 | 7 | @pytest.fixture(params=[1]) 8 | def moe_num_experts(request): 9 | return str(request.param) 10 | 11 | 12 | @pytest.fixture(params=[1]) 13 | def mp_size(request): 14 | return str(request.param) 15 | 16 | 17 | @pytest.fixture 18 | def params(moe_num_experts, mp_size): 19 | base_dir = os.getenv("MEGATRON_CKPT_DIR") 20 | assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" 21 | 22 | vocab_file = os.path.join(base_dir, "gpt2-vocab.json") 23 | merge_file = os.path.join(base_dir, "gpt2-merges.txt") 24 | ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") 25 | 26 | return [ 27 | "--micro-batch-size", "1", 28 | "--num-layers", "24", 29 | "--hidden-size", "1024", 30 | "--num-attention-heads", "16", 31 | "--max-position-embeddings", "1024", 32 | "--vocab-file", vocab_file, 33 | "--merge-file", merge_file, 34 | "--load", ckpt_path, 35 | "--seq-length", "1024", 36 | "--out-seq-length", "1024", 37 | "--tensor-model-parallel-size", mp_size, 38 | "--tokenizer-type", "GPT2BPETokenizer", 39 | "--num-experts", moe_num_experts, 40 | "--mlp-type", "standard", 41 | "--num-samples", "0", 42 | "--fp16", 43 | ] 44 | 45 | 46 | def test_moe_megatron(params, mp_size): 47 | output_re = r"===START OUTPUT===([\S\s]*)===END OUTPUT===" 48 | 49 | # Run the baseline 50 | baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params 51 | result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) 52 | baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 53 | 54 | # Run with DeepSpeed 55 | deepspeed_cmd = baseline_cmd + ["--ds-inference"] 56 | result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) 57 | deepspeed_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 58 | 59 | assert ( 60 | baseline_output == deepspeed_output 61 | ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" 62 | 
-------------------------------------------------------------------------------- /tools/convert_checkpoint/deepspeed_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import torch 5 | import json 6 | 7 | from deepspeed_checkpoint import DeepSpeedCheckpoint 8 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments 9 | 10 | # the import was tested to work with this version 11 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider 12 | # copying that version here instead 13 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint 14 | from transformers import GPT2Config 15 | 16 | def main(): 17 | 18 | # this first part comes mainly from deepspeed_to_megatron.main 19 | args = parse_arguments() 20 | print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}') 21 | 22 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) 23 | iteration = ds_checkpoint.get_iteration() 24 | input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release) 25 | 26 | # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main 27 | # Spell out all parameters in case the defaults change. 28 | config = GPT2Config( 29 | vocab_size=50257, 30 | n_positions=1024, 31 | n_ctx=1024, 32 | n_embd=1024, 33 | n_layer=24, 34 | n_head=16, 35 | n_inner=4096, 36 | activation_function="gelu", # used to be "gelu_new" in earlier versions 37 | resid_pdrop=0.1, 38 | embd_pdrop=0.1, 39 | attn_pdrop=0.1, 40 | layer_norm_epsilon=1e-5, 41 | initializer_range=0.02, 42 | summary_type="cls_index", 43 | summary_use_proj=True, 44 | summary_activation=None, 45 | summary_proj_to_labels=True, 46 | summary_first_dropout=0.1, 47 | scale_attn_weights=True, 48 | gradient_checkpointing=False, 49 | use_cache=True, 50 | bos_token_id=50256, 51 | eos_token_id=50256, 52 | ) 53 | 54 | # Convert. 55 | print("Converting to HF Checkpoint") 56 | output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) 57 | 58 | basename = args.output_folder 59 | os.makedirs(basename, exist_ok=True) 60 | 61 | # Print the structure of converted state dict. 62 | #if args.print_checkpoint_structure: 63 | # recursive_print(None, output_state_dict) 64 | 65 | # Store the config to file. 66 | output_config_file = os.path.join(basename, "config.json") 67 | output_config = config.to_dict() 68 | output_config["architectures"] = ["GPT2LMHeadModel"] 69 | output_config["model_type"] = "gpt2" 70 | print(f'Saving config to "{output_config_file}"') 71 | with open(output_config_file, "w") as f: 72 | json.dump(output_config, f) 73 | 74 | # Store the state_dict to file. 
75 | output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") 76 | print(f'Saving checkpoint to "{output_checkpoint_file}"') 77 | torch.save(output_state_dict, output_checkpoint_file) 78 | 79 | print("Now add tokenizer files and upload to the hub") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} <checkpoint file>') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file) 34 | dump_data(sd) 35 | 36 | 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_deepspeed_checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from deepspeed_checkpoint import DeepSpeedCheckpoint 3 | 4 | def list_files(file_list, tag): 5 | print(f'Listing files: {tag}') 6 | for i, file in enumerate(file_list): 7 | print(f'{i+1}: {file}') 8 | 9 | def parse_arguments(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--folder', default=None, type=str, help='DeepSpeed Checkpoint folder') 12 | parser.add_argument('--target_tp', default=None, type=int, help='Target TP degree') 13 | parser.add_argument('--target_pp', default=None, type=int, help='Target PP degree') 14 | args = parser.parse_args() 15 | print(f'args = {args}') 16 | return args 17 | 18 | 19 | def show_input_files(ds_checkpoint): 20 | list_files(ds_checkpoint.file_list, 'all') 21 | list_files(ds_checkpoint.zero_files, 'zero') 22 | list_files(ds_checkpoint.layer_files, 'layer') 23 | list_files(ds_checkpoint.mp_rank_files, 'mp rank') 24 | 25 | def show_simple_state(ds_checkpoint): 26 | print(f'layer keys = {ds_checkpoint.layer_keys}') 27 | print(f'layer count = {ds_checkpoint.layer_count}') 28 | 29 | print(f'tp_degree_count = {ds_checkpoint.tp_degree}') 30 | print(f'pp_degree_count = {ds_checkpoint.pp_degree}') 31 | print(f'dp_degree_count = {ds_checkpoint.dp_degree}') 32 | 33 | def show_mappings(ds_checkpoint): 34 | ds_checkpoint.show_pp_tranformer_map() 35 | ds_checkpoint.show_transformer_file_map() 36 | ds_checkpoint.show_tp_embedding_map() 37 | ds_checkpoint.show_tp_final_norm_map() 38 | 39 | def show_state_summary(tag, sd): 40 | summary = {k:v.shape for k,v in sd.items()} 41 | print(f'{tag} = {summary}') 42 | 43 | def show_embedding_states(ds_checkpoint): 44 | for i in range(0, ds_checkpoint.tp_degree): 45 | sd = ds_checkpoint.get_embedding_state(i) 46 | show_state_summary(f'embedding[{i}]', sd) 47 | 48 | def 
show_final_norm_states(ds_checkpoint): 49 | for i in range(0, ds_checkpoint.tp_degree): 50 | sd = ds_checkpoint.get_final_norm_state(i) 51 | show_state_summary(f'final_norm[{i}]', sd) 52 | 53 | def show_transformer_states(ds_checkpoint): 54 | for i in range(0, ds_checkpoint.tp_degree): 55 | for j in range(0, ds_checkpoint.pp_degree): 56 | state_list = ds_checkpoint.get_transformer_state(tp_index=i, pp_index=j) 57 | print(f'tp_pp_rank[{i},{j}] = ') 58 | for k, sd in enumerate(state_list): 59 | show_state_summary(f' block[{k}]', sd) 60 | print("") 61 | 62 | 63 | def main(): 64 | print(f'Inspecting DeepSpeed Checkpoint') 65 | args = parse_arguments() 66 | 67 | ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, args.target_pp) 68 | ds_checkpoint.validate_files() 69 | 70 | show_input_files(ds_checkpoint) 71 | show_simple_state(ds_checkpoint) 72 | show_mappings(ds_checkpoint) 73 | show_embedding_states(ds_checkpoint) 74 | show_final_norm_states(ds_checkpoint) 75 | show_transformer_states(ds_checkpoint) 76 | checkpoint_args = ds_checkpoint.get_args() 77 | print(f'checkpoint args = {checkpoint_args}') 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 4 | os.path.pardir))) 5 | 6 | from megatron import print_rank_0 7 | from megatron.indexer import IndexBuilder 8 | from megatron.initialize import initialize_megatron 9 | 10 | 11 | def main(): 12 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 13 | - Include all args needed for initial model specification 14 | 15 | Other key args: 16 | --block-data-path: path to write to 17 | --ict-load or --realm-load: path to checkpoint with which to embed 18 | --data-path and --titles-data-path: paths for dataset 19 | --indexer-log-interval: reporting interval 20 | --indexer-batch-size: size specific for indexer jobs 21 | 22 | Check README.md for example script 23 | """ 24 | 25 | initialize_megatron(extra_args_provider=None, 26 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 27 | index_builder = IndexBuilder() 28 | index_builder.build_and_save_index() 29 | print_rank_0("Build and save indices: done!") 30 | 31 | if __name__ == "__main__": 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, 
sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command.split()) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare the training dataset used to train the model. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py <path to the deduplicated URLs> <output file for clean urls> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data file> 28 | ``` 29 | Additional cleanup (e.g. removing documents with fewer than 512 characters, or dataset-specific cleaning like for the stories and realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <list of input data files> --output <output file of possible duplicates> 33 | ``` 34 | 3. Based on the similarity measure defined inside the function `is_similar` (default threshold: 0.9), group urls that are similar. For each group, we keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <possible duplicates file> <output file of grouped duplicate urls> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <grouped duplicate urls file> <input data file> <output deduplicated data file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <deduplicated data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <task files> --dedup-dataset <training dataset to deduplicate> --output <deduped training dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, as well as any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path to the lambada test set via `--lambada-path <path to lambada test set>`.
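For example, a concrete invocation (the file names here are only illustrative) could look like:

```
python filter_ngrams.py --tasks lambada --dedup-dataset train_data.json --output train_data_deduped.json --lambada-path <path to lambada test set>
```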
58 | 59 | Several other features (e.g. saving and loading the dictionary) have been added; see `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. Users can add a prefix 23 | to the ids. 24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the file. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | --------------------------------------------------------------------------------