├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── dataset ├── README.md ├── download_books.sh ├── download_ckpt.sh └── download_vocab.sh ├── examples ├── MoE │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ ├── ds_pretrain_gpt_1.3B_dense.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ ├── ds_pretrain_gpt_125M_MoE64.sh │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ ├── ds_pretrain_gpt_350M_MoE128.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ ├── ds_pretrain_gpt_350M_dense.sh │ ├── ds_pretrain_gpt_6.7B_dense.sh │ └── readme_evalharness.md ├── README.md ├── azure │ ├── README.md │ ├── run-175b.sh │ ├── run-1t.sh │ └── run-benchmark-model.sh ├── azureml │ ├── Dockerfile.dockerfile │ ├── README.md │ ├── aml_submit.py │ └── prepare_dataset.py ├── bert_with_pile │ ├── README.md │ ├── ds_config_bert_TEMPLATE.json │ ├── ds_finetune_bert_mnli.sh │ ├── ds_finetune_bert_qqp.sh │ ├── ds_finetune_bert_race.sh │ ├── ds_pretrain_bert.sh │ └── prepare_pile_data.py ├── compression │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_config_gpt_TEMPLATE_compression.json │ ├── ds_evalharness.sh │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ └── ds_pretrain_gpt_350M_dense_kd.sh ├── create_embeddings.sh ├── curriculum_learning │ ├── README.md │ ├── ds_pretrain_gpt2.sh │ ├── ds_train.sh │ ├── ds_zero_stage_1_config_baseline.json │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json ├── data_efficiency │ ├── README.md │ ├── analyze_data.py │ ├── bert │ │ ├── ds_analyze_bert_data_map.sh │ │ ├── ds_analyze_bert_data_reduce.sh │ │ ├── finetune │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_mnli.sh │ │ │ ├── ds_finetune_bert_qqp.sh │ │ │ ├── ds_finetune_bert_race.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── finetune_glue │ │ │ ├── ds_config_bert_TEMPLATE.json │ │ │ ├── ds_finetune_bert_glue.sh │ │ │ ├── ds_finetune_bert_glue_run.sh │ │ │ └── ds_finetune_gather_result.py │ │ ├── pile_data_download_preprocess.py │ │ └── pretrain │ │ │ ├── ds_config_bert_1clmetric_TEMPLATE.json │ │ │ ├── ds_config_bert_2clmetrics_TEMPLATE.json │ │ │ ├── ds_pretrain_bert_336M_base_script.sh │ │ │ └── ds_pretrain_bert_336M_run.sh │ └── gpt │ │ ├── ds_analyze_gpt_data_map.sh │ │ ├── ds_analyze_gpt_data_reduce.sh │ │ ├── eval │ │ ├── ds_config_eval_dummy.json │ │ ├── ds_evalharness_1gpu.sh │ │ ├── ds_evalharness_gather_result.py │ │ ├── ds_evalharness_parallel_run.sh │ │ └── ds_evalharness_parallel_run_10shot.sh │ │ └── pretrain │ │ ├── ds_config_gpt_1clmetric_TEMPLATE.json │ │ ├── ds_config_gpt_2clmetrics_TEMPLATE.json │ │ ├── ds_pretrain_gpt_1.3B_dense_base_script.sh │ │ └── ds_pretrain_gpt_1.3B_dense_run.sh ├── evaluate_ict_zeroshot_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── generate_text.sh ├── merge_mp_bert.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── 
pretrain_llama_distributed.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh └── run_deepspeed_example.sh ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rotary_pos_embedding.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── package_info.py ├── schedules.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ ├── sp_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_t5.py ├── pretrain_vit.py ├── requirements.txt ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── cola.py │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ ├── mrpc.py │ ├── qnli.py │ ├── qqp.py │ ├── rte.py │ ├── sst2.py │ └── stsb.py ├── main.py ├── orqa │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ └── natural_questions │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── run_megatron.py ├── test_basic.py └── test_megatron.py └── tools ├── convert_checkpoint ├── README.md ├── deepspeed_checkpoint.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── create_doc_index.py ├── generate_samples_gpt.py ├── linter.py ├── merge_mp_partitions.py ├── openwebtext ├── README.md ├── add_id.py ├── 
blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py └── preprocess_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | # Distribution / packaging 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | 8 | # binaries 9 | *.so 10 | 11 | # tmp files 12 | *.swp 13 | 14 | # AML workspace config file 15 | *config.json -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc 4 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to set up the dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | -------------------------------------------------------------------------------- /dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../..
8 | 9 | -------------------------------------------------------------------------------- /dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /examples/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /examples/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /examples/MoE/ds_evalharness.sh: -------------------------------------------------------------------------------- 1 | # This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory. 
2 | 3 | CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B/global_step81566/ 4 | CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B.json 5 | RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log 6 | 7 | PP_SIZE=1 8 | TP_SIZE=1 9 | NO_PP="true" 10 | EP_PARALLEL_SIZE=1 11 | # Currently the eval harness does not support data parallelism. 12 | # However, for MoE models it's possible to enable a "fake data parallelism" 13 | # in order to load experts on multiple GPUs. At the same time, it's not 14 | # real data parallelism because we load the same data on all GPUs. 15 | # On the other hand, it's better to use fewer GPUs than in training, 16 | # to reduce communication overhead. 17 | NUM_NODE=1 18 | NUM_GPU_PER_NODE=1 19 | 20 | TASKS="lambada" 21 | # WikiText-2, not used in the GPT-3 paper but used in the GPT-2 paper 22 | # TASKS="wikitext" 23 | # Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2. 24 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext" 25 | # All tasks confirmed to work; there are more tasks at https://github.com/EleutherAI/lm-evaluation-harness that we didn't test. 26 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli" 27 | 28 | VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json 29 | MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt 30 | 31 | export HF_DATASETS_OFFLINE=1 32 | 33 | # Dummy arguments to make Megatron happy. No need to configure them. 34 | # The reason we don't need to configure them and many other arguments is 35 | # that the eval framework will read the arguments from the checkpoint file.
36 | MEGATRON_REQUIRED_ARGS="\ 37 | --num-layers -1\ 38 | --hidden-size -1\ 39 | --num-attention-heads -1\ 40 | --seq-length -1 \ 41 | --max-position-embeddings -1 42 | " 43 | 44 | CMD="../../tasks/eval_harness/evaluate.py \ 45 | --load $CHECKPOINT_PATH\ 46 | --tensor-model-parallel-size $TP_SIZE \ 47 | --pipeline-model-parallel-size $PP_SIZE\ 48 | --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \ 49 | --vocab-file $VOCAB_FILE\ 50 | --merge-file $MERGE_FILE\ 51 | --micro-batch-size 12\ 52 | --no-load-optim \ 53 | --no-load-rng \ 54 | --inference \ 55 | --disable-moe-token-dropping \ 56 | --adaptive_seq_len\ 57 | --eval_fp32\ 58 | --task_list $TASKS\ 59 | --results_path $RESULT_PATH \ 60 | --deepspeed \ 61 | --deepspeed_config $CONFIG_PATH \ 62 | $MEGATRON_REQUIRED_ARGS\ 63 | " 64 | 65 | if [[ "${NO_PP}" = "true" ]]; then 66 | CMD="${CMD} \ 67 | --no-pipeline-parallel" 68 | fi 69 | 70 | LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE" 71 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes and Scripts 2 | 3 | Please note that some of the script examples (e.g., pretrain_*.sh directly under the ```Megatron-DeepSpeed/examples/``` folder) come from NVIDIA's original Megatron-LM and do not have DeepSpeed integration (scripts with DeepSpeed integration include the ```deepspeed``` keyword). Below we list various examples that do have DeepSpeed integration. 4 | 5 | ### Azure 6 | 7 | We strongly recommend starting with the AzureML recipe in the ```azureml``` folder. 8 | 9 | If you have custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder. 10 | 11 | ### MoE 12 | 13 | Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-Experts (MoE) based models and dense models. These recipes are for GPT-style NLG models. 14 | 15 | ### Data Efficiency 16 | 17 | The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for the DeepSpeed Data Efficiency Library. Please refer to the detailed tutorials in data_efficiency/README.md. 18 | 19 | ### Curriculum Learning 20 | 21 | Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models. 22 | Note that the DeepSpeed Data Efficiency Library above includes more general curriculum learning support. The legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library instead. 23 | 24 | ### Model Compression 25 | 26 | The ```compression``` folder includes examples of layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) on the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models. 27 | 28 | ### BERT example 29 | 30 | The ```bert_with_pile``` folder includes examples of BERT-style model pre-training (using the public Pile data or users' own data) with DeepSpeed integration. Please refer to the readme in the folder for a tutorial.
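Most recipes above share a common pattern: a ```ds_config_*_TEMPLATE.json``` file whose ```CONFIG_*```-style placeholders are filled in by the launcher script before the resulting config is passed to ```deepspeed```. Below is a minimal sketch of that substitution step; the helper function and the example values are ours for illustration (the actual recipe scripts do the equivalent with shell string substitution).

```python
# Minimal sketch (not part of any recipe): instantiate a
# ds_config_*_TEMPLATE.json by plain textual substitution.
from pathlib import Path

def render_template(template_path: str, output_path: str, values: dict) -> None:
    """Replace each placeholder token with its value, as raw text."""
    text = Path(template_path).read_text()
    for placeholder, value in values.items():
        text = text.replace(placeholder, str(value))
    Path(output_path).write_text(text)

# Illustrative values only; the real recipes derive these from the model setup.
render_template(
    "examples/MoE/ds_config_gpt_TEMPLATE.json",
    "ds_config_gpt.json",
    {
        "CONFIG_BATCH_SIZE": 256,
        "CONFIG_MBSIZE": 4,
        "LOG_INTERVAL": 10,
        "ZERO_STAGE": 1,
        "PRESCALE_GRAD": "true",
        "CONFIG_FP16_ENABLED": "true",
        "CONFIG_BF16_ENABLED": "false",
        "CONFIG_CL_ENABLED": "false",
        "CONFIG_CL_MIN": 80,
        "CONFIG_CL_MAX": 2048,
        "CONFIG_CL_DURATION": 10000,
    },
)
```

Note that placeholders are spliced in as raw text rather than as JSON values, which is why booleans are passed as the strings ```true```/```false```.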
31 | -------------------------------------------------------------------------------- /examples/azure/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for experimentation on Azure 2 | 3 | The recipes have been tested from the command line on a cluster set up using Azure VMs and VMSS, as well as inside Docker-based environments. 4 | 5 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows: 6 | 7 | ```bash examples/azure/run-benchmark-model.sh``` 8 | 9 | ### Pre-requisites 10 | 11 | To run the above script, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder: 12 | 13 | ```bash dataset/download_books.sh``` 14 | 15 | ```bash dataset/download_vocab.sh``` 16 | 17 | ### Run 175B and 1T models 18 | 19 | We have included two recipes, one for the 175B model and one for the 1T model. To train these models, we assume that users will modify and tune hyperparameters and configurations themselves. To facilitate initial training, we have made the recipes runnable with the Books dataset as follows. 20 | 21 | ```bash examples/azure/run-175b.sh``` 22 | 23 | ```bash examples/azure/run-1t.sh``` 24 | 25 | ### Note about ZeRO stage 3 and CPU offload 26 | 27 | By default, we have enabled ZeRO Stage 3 for both of the recipes above. For the 1T model, we have also enabled the CPU-offload feature to save memory and enable a larger batch size that offers better performance. 28 | -------------------------------------------------------------------------------- /examples/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.5-gpu 2 | USER root:root 3 | 4 | RUN pip install pybind11 5 | RUN pip install regex -------------------------------------------------------------------------------- /examples/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Set up an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create an AML Dataset. To run a remote AML job, you need to provide an AML FileDataset. 11 | Refer to the [prepare_dataset script](prepare_dataset.py) for uploading the .bin and .idx files to the blob store and for creating the FileDataset. 12 | 13 | # Training 14 | Run Megatron-DeepSpeed on Azure ML. Refer to the [aml_submit script](aml_submit.py).
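For orientation, here is a rough sketch of what such a submission can look like with the v1 ```azureml-core``` SDK. It is not a substitute for the aml_submit script: the compute target name, experiment name, entry point, and arguments below are placeholders.

```python
# Rough sketch of an AzureML v1 SDK submission, in the spirit of
# aml_submit.py (see that script for the real, tested version).
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.core.runconfig import PyTorchConfiguration

ws = Workspace.from_config()  # reads the workspace config.json

# Build the job environment from the Dockerfile in this folder.
env = Environment.from_dockerfile(
    name="megatron-ds-env",
    dockerfile=open("examples/azureml/Dockerfile.dockerfile").read(),
)

run_config = ScriptRunConfig(
    source_directory=".",                   # repo root
    script="pretrain_gpt.py",               # entry point (placeholder)
    arguments=["--micro-batch-size", "4"],  # illustrative only
    compute_target="megatron-compute",      # your AML compute cluster (placeholder)
    environment=env,
    distributed_job_config=PyTorchConfiguration(node_count=2),
)

run = Experiment(ws, name="megatron-deepspeed").submit(run_config)
run.wait_for_completion(show_output=True)
```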
15 | -------------------------------------------------------------------------------- /examples/azureml/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Use this script to upload data to the blob store 2 | 3 | # AzureML libraries 4 | from azureml.core import Workspace 5 | from azureml.core.dataset import Dataset 6 | from azureml.data.datapath import DataPath 7 | 8 | ws = Workspace.from_config() 9 | print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') 10 | 11 | data_dir = "bookcorpus_data" # Local directory containing the data (.bin and .idx files) 12 | blobstore_datadir = data_dir # Blob store directory to store data in 13 | 14 | datastore = ws.get_default_datastore() 15 | 16 | # Book Corpus Data 17 | print("upload dataset to blob store") 18 | uploaded_data = Dataset.File.upload_directory( 19 | src_dir=data_dir, 20 | target=DataPath(datastore, blobstore_datadir), 21 | show_progress=True 22 | ) 23 | 24 | # Usage after uploading the directory 25 | # To refer to the folder directly: 26 | train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)]) 27 | print(train_dataset) 28 | # To refer to a specific file: 29 | # train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir + "/filename.ext")]) 30 | # Create a DatasetConsumptionConfig to specify how to deliver the dataset to a compute target. 31 | # In the submitted run, files in the datasets will be either mounted or downloaded to a local path on the compute target. 32 | # input_data_dir = train_dataset.as_mount() 33 | # input_data_dir = train_dataset.as_download() 34 | -------------------------------------------------------------------------------- /examples/bert_with_pile/README.md: -------------------------------------------------------------------------------- 1 | This ```bert_with_pile``` folder includes examples of BERT pre-training (using [the public Pile data](https://github.com/EleutherAI/the-pile) or users' own data) with DeepSpeed integration. We also provide scripts for preprocessing the Pile data and for MNLI finetuning. 2 | 3 | ## Data preprocessing 4 | ```prepare_pile_data.py``` is the script for downloading, decompressing, and preprocessing [the public Pile data](https://github.com/EleutherAI/the-pile). Users can also modify this script to preprocess their own training data. 5 | 6 | ## BERT pre-training 7 | ```ds_pretrain_bert.sh``` is the script for BERT pre-training integrated with DeepSpeed, supporting [ZeRO](https://www.deepspeed.ai/tutorials/zero/) together with Megatron's tensor-slicing model parallelism. The training hyperparameters follow the [Megatron paper](https://arxiv.org/abs/1909.08053). Note that pipeline parallelism is currently not supported: DeepSpeed's pipeline parallelism is only integrated with the GPT case, and DeepSpeed is currently not integrated with Megatron's own pipeline parallelism. 8 | 9 | As a reference performance number, our measurements show that this example achieves a throughput of up to 145 TFLOPs per GPU when pre-training a 1.3B BERT model (with ZeRO stage-1, without model parallelism, on 64 NVIDIA A100 GPUs, with batch size 4096 (64 per GPU), and with activation checkpointing).
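As a back-of-envelope check of such throughput numbers, one can use the common approximation of ~6*N training FLOPs per token for an N-parameter dense transformer, or ~8*N when activation checkpointing recomputes the forward pass. The sketch below is our illustration, not part of the recipe.

```python
# Back-of-envelope throughput check (ours, not part of the recipe),
# assuming ~6*N training FLOPs per token for an N-parameter dense
# transformer, and ~8*N with activation checkpointing (extra forward pass).
def tflops_per_gpu(n_params: float, samples_per_sec: float, seq_len: int,
                   num_gpus: int, activation_checkpointing: bool = True) -> float:
    flops_per_token = (8 if activation_checkpointing else 6) * n_params
    tokens_per_sec = samples_per_sec * seq_len
    return flops_per_token * tokens_per_sec / num_gpus / 1e12

# A 1.3B BERT at sequence length 512 on 64 GPUs needs roughly 1730
# samples/sec in total (~27 per GPU) to sustain ~145 TFLOPs per GPU:
print(tflops_per_gpu(1.3e9, 1730, 512, 64))  # ~143.9
```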
10 | 11 | One thing to note is that this pre-training recipe is NOT a strict reproduction of the [original BERT paper](https://arxiv.org/abs/1810.04805): the Pile data is larger than the data used for the original BERT (and the data used in the Megatron paper); Megatron-LM introduces some changes to the BERT model (see details in the [Megatron paper](https://arxiv.org/abs/1909.08053)); and the training hyperparameters are also different. Overall, these differences lead to longer training time but also better model quality than the original BERT (see the MNLI scores below), while supporting larger model scale through the combination of ZeRO and model parallelism. If you don't have enough computation budget, we recommend reducing the total training iterations (```train_iters``` in the script) and potentially increasing the learning rate at the same time. If you want to strictly reproduce the original BERT, we recommend using our [other BERT example](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert). 12 | 13 | ## BERT MNLI fine-tuning 14 | ```ds_finetune_bert_mnli.sh``` is the script for BERT MNLI fine-tuning, following the hyperparameters in the [Megatron paper](https://arxiv.org/abs/1909.08053). As a reference, the table below presents the scores using the model pre-trained with the script above, compared with the scores of the original BERT and the Megatron paper's BERT. Our BERT-Large score is slightly lower than the Megatron paper's, mainly due to the different data we used (the Pile data is much more diverse and larger than the data in the Megatron paper, which potentially has a negative effect on small, million-scale models). 15 | 16 | | MNLI dev set accuracy | **MNLI-m** | **MNLI-mm** | 17 | | ---------- |---------- |---------- | 18 | | BERT-Base, [original BERT](https://arxiv.org/abs/1810.04805) | 84.6 | 83.4 | 19 | | BERT-Base, ours (median of 5 seeds) | 86.1 | 86.1 | 20 | | BERT-Large, [original BERT](https://arxiv.org/abs/1810.04805) | 86.7 | 85.9 | 21 | | BERT-Large, [Megatron paper](https://arxiv.org/abs/1909.08053) | 89.7 | 90.0 | 22 | | BERT-Large, ours (median of 5 seeds) | 89.1 | 89.6 | 23 | 24 | -------------------------------------------------------------------------------- /examples/bert_with_pile/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | 27 | "wall_clock_breakdown" : false 28 | } 29 | -------------------------------------------------------------------------------- /examples/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0,
17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /examples/compression/ds_config_gpt_TEMPLATE_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false, 39 | 40 | "compression_training": { 41 | "weight_quantization": { 42 | "shared_parameters":{ 43 | "enabled": true, 44 | "quantizer_kernel": false, 45 | "schedule_offset": 50, 46 | "quantize_groups": 48, 47 | "quantize_verbose": false, 48 | "quantization_type": "symmetric", 49 | "rounding": "nearest", 50 | "fp16_mixed_quantize":{ 51 | "enabled": false, 52 | "quantize_change_ratio": 0.001 53 | } 54 | }, 55 | "different_groups":{ 56 | "wq1": { 57 | "params": { 58 | "start_bits": 12, 59 | "target_bits": 4, 60 | "quantization_period": 50 61 | }, 62 | "modules": [ 63 | "encoder.layers" 64 | ] 65 | } 66 | } 67 | }, 68 | "activation_quantization": { 69 | "shared_parameters":{ 70 | "enabled": true, 71 | "quantization_type": "asymmetric", 72 | "range_calibration": "static", 73 | "schedule_offset": 50 74 | }, 75 | "different_groups":{ 76 | "aq1": { 77 | "params": { 78 | "bits": 8 79 | }, 80 | "modules": [ 81 | "encoder.layers" 82 | ] 83 | } 84 | } 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/compression/ds_evalharness.sh: -------------------------------------------------------------------------------- 1 | # This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory. 
2 | 3 | # CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step2000/ 4 | # CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71000/ 5 | # CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M12L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step5000/ 6 | CHECKPOINT_PATH=/blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-test2-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-15-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71426/ 7 | CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus--1-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B.json 8 | RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log 9 | 10 | PP_SIZE=1 11 | TP_SIZE=1 12 | NO_PP="true" 13 | EP_PARALLEL_SIZE=1 14 | # Currently the eval harness does not support data parallelism. 15 | # However, for MoE models it's possible to enable a "fake data parallelism" 16 | # in order to load experts on multiple GPUs. At the same time, it's not 17 | # real data parallelism because we load the same data on all GPUs. 18 | # On the other hand, it's better to use fewer GPUs than in training, 19 | # to reduce communication overhead. 20 | NUM_NODE=1 21 | NUM_GPU_PER_NODE=1 22 | 23 | # TASKS="lambada" 24 | # WikiText-2, not used in the GPT-3 paper but used in the GPT-2 paper 25 | TASKS="lambada,wikitext" 26 | # Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2. 27 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext" 28 | # All tasks confirmed to work; there are more tasks at https://github.com/EleutherAI/lm-evaluation-harness that we didn't test. 29 | # TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli" 30 | 31 | VOCAB_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json 32 | MERGE_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt 33 | 34 | export HF_DATASETS_OFFLINE=1 35 | 36 | # Dummy arguments to make Megatron happy. No need to configure them. 37 | # The reason we don't need to configure them and many other arguments is 38 | # that the eval framework will read the arguments from the checkpoint file.
39 | MEGATRON_REQUIRED_ARGS="\ 40 | --num-layers -1\ 41 | --hidden-size -1\ 42 | --num-attention-heads -1\ 43 | --seq-length -1 \ 44 | --max-position-embeddings -1 45 | " 46 | 47 | CMD="../../tasks/eval_harness/evaluate.py \ 48 | --load $CHECKPOINT_PATH\ 49 | --tensor-model-parallel-size $TP_SIZE \ 50 | --pipeline-model-parallel-size $PP_SIZE\ 51 | --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \ 52 | --vocab-file $VOCAB_FILE\ 53 | --merge-file $MERGE_FILE\ 54 | --micro-batch-size 12\ 55 | --no-load-optim \ 56 | --no-load-rng \ 57 | --inference \ 58 | --disable-moe-token-dropping \ 59 | --adaptive_seq_len\ 60 | --eval_fp32\ 61 | --task_list $TASKS\ 62 | --results_path $RESULT_PATH \ 63 | --deepspeed \ 64 | --deepspeed_config $CONFIG_PATH \ 65 | $MEGATRON_REQUIRED_ARGS\ 66 | " 67 | 68 | if [[ "${NO_PP}" = "true" ]]; then 69 | CMD="${CMD} \ 70 | --no-pipeline-parallel" 71 | fi 72 | 73 | LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE" 74 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /examples/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). 
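For intuition, the ```fixed_linear``` seqlen schedule used by the configs in this folder ramps the difficulty (sequence length) linearly from ```min_difficulty``` to ```max_difficulty``` over ```total_curriculum_step``` steps, rounded to a multiple of ```difficulty_step```. Below is a small sketch of our reading of that rule, with values taken from ds_train.sh; it is not code from the DeepSpeed library itself.

```python
# Sketch of the "fixed_linear" seqlen schedule (our reading of the config
# semantics, not DeepSpeed source). Defaults mirror ds_train.sh:
# 64 -> 1024 over 45000 steps, rounded to a multiple of 8.
def fixed_linear_seqlen(step: int, min_diff: int = 64, max_diff: int = 1024,
                        total_curriculum_step: int = 45000,
                        difficulty_step: int = 8) -> int:
    frac = min(1.0, step / total_curriculum_step)
    seqlen = min_diff + frac * (max_diff - min_diff)
    seqlen = int(seqlen) // difficulty_step * difficulty_step
    return max(min_diff, min(seqlen, max_diff))

for step in (0, 10000, 45000, 60000):
    print(step, fixed_linear_seqlen(step))  # 64, 272, 1024, 1024
```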
-------------------------------------------------------------------------------- /examples/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/data_efficiency/README.md: -------------------------------------------------------------------------------- 1 | This directory includes GPT-3/BERT pretraining example scripts for DeepSpeed Data Efficiency Library technologies (curriculum learning, random-LTD, and the two composed together). 
2 | 3 | You need to install an updated DeepSpeed version (>=0.8.0), which contains the DeepSpeed Data Efficiency Library. 4 | 5 | An additional tutorial can be found on the [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/). 6 | 7 | Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597). 8 | 9 | ## GPT-3 pretraining and evaluation 10 | Inside the ``gpt`` folder, ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` are first used for curriculum learning's offline data analysis and indexing. 11 | 12 | ``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of the data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality. 13 | 14 | ``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot. 15 | 16 | ## BERT pretraining and finetuning 17 | Inside the ``bert`` folder, ``pile_data_download_preprocess.py`` can first be used to download and preprocess the public Pile dataset. 18 | 19 | The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. 20 | 21 | ``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale the peak learning rate when using less than 100% of the data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we found that scaling the LR based on the percentage of data used helps improve model quality. 22 | 23 | ``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable, and the Megatron-LM paper used a very large number of epochs for MNLI/QQP, which is not necessary. Thus we added the capability of finetuning other GLUE tasks, and switched to following the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend using instead of ``bert/finetune``. Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning. -------------------------------------------------------------------------------- /examples/data_efficiency/bert/ds_analyze_bert_data_map.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_workers=1 # Num nodes to run the map job 4 | num_threads=40 # Num threads on each node. Set this based on #CPU cores 5 | 6 | # If different data epochs have slightly different data samples (e.g., due 7 | # to randomness), then you need to specify large enough num_epochs that cover 8 | # whole pretraining.
If different data epochs are the same, set num_epochs to 9 | # 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency 10 | # library will automatically handle reshuffling when reaching another epoch. 11 | num_epochs=5 12 | 13 | # Which node is this node (start with 0 and end with num_workers-1). This 14 | # script only launches the map job on 1 worker node, since we don't expect 15 | # running on many nodes and workers don't need any communication. But you 16 | # can modify this script to add an MPI/torch distributed launcher. 17 | worker_id=$1 18 | save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" 19 | 20 | metric='total_vocab_freq' 21 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 22 | # metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq 23 | # metric='seqlen' 24 | 25 | seq_len=512 26 | batch_size=10000 27 | 28 | jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" 29 | ## The public Pile dataset; see prepare_pile_data.py in the same directory 30 | ## for how to download and preprocess the data. 31 | ## Change data_home to your own training data path. 32 | # data_home="/vc_data_blob/users/conglli/the_pile_bert" 33 | data_home="/blob/data/the_pile_bert" 34 | data_path="${data_home}/pile_bert_train_text_sentence" 35 | 36 | vocab_path="bert-large-uncased-vocab.txt" 37 | if [ ! -f "$vocab_path" ]; then 38 | wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt 39 | fi 40 | 41 | # Make sure the "--split" is the same as what you will use for pre-training. 42 | options=" \ 43 | --analyzing-task map \ 44 | --analyzing-data-type BERT \ 45 | --analyzing-metric ${metric} \ 46 | --analyzing-num-workers ${num_workers} \ 47 | --analyzing-worker-id ${worker_id} \ 48 | --analyzing-num-threads ${num_threads} \ 49 | --vocab-file ${vocab_path} \ 50 | --data-path ${data_path} \ 51 | --data-impl mmap \ 52 | --tokenizer-type BertWordPieceLowerCase \ 53 | --micro-batch-size ${batch_size} \ 54 | --global-batch-size ${batch_size} \ 55 | --seq-length ${seq_len} \ 56 | --max-position-embeddings ${seq_len} \ 57 | --num-layers 1 \ 58 | --hidden-size 1 \ 59 | --num-attention-heads 1 \ 60 | --split 949,50,1 \ 61 | --distributed-backend gloo \ 62 | --train-data-exact-num-epochs ${num_epochs} \ 63 | --return-data-index \ 64 | --save-interval 1 \ 65 | --save ${save_path}" 66 | 67 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same as what you used during the map job. We need these 2 4 | # configs to know how many map job result files we have. 5 | num_workers=1 6 | num_threads=40 7 | # Reduce job only has 1 worker but can accelerate by multithreading. 8 | num_threads_reduce=40 9 | 10 | # If different data epochs have slightly different data samples (e.g., due 11 | # to randomness), then you need to specify large enough num_epochs that cover 12 | # whole pretraining. If different data epochs are the same, set num_epochs to 13 | # 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency 14 | # library will automatically handle reshuffling when reaching another epoch.
15 | num_epochs=5 16 | 17 | save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" 18 | 19 | metric='total_vocab_freq' 20 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 21 | # metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq 22 | # metric='seqlen' 23 | 24 | seq_len=512 25 | batch_size=10000 26 | 27 | jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce" 28 | ## The public Pile dataset; see prepare_pile_data.py in the same directory 29 | ## for how to download and preprocess the data. 30 | ## Change data_home to your own training data path. 31 | # data_home="/vc_data_blob/users/conglli/the_pile_bert" 32 | data_home="/blob/data/the_pile_bert" 33 | data_path="${data_home}/pile_bert_train_text_sentence" 34 | 35 | vocab_path="bert-large-uncased-vocab.txt" 36 | if [ ! -f "$vocab_path" ]; then 37 | wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt 38 | fi 39 | 40 | # Make sure the "--split" is the same as what you will use for pre-training. 41 | options=" \ 42 | --analyzing-task reduce \ 43 | --analyzing-data-type BERT \ 44 | --analyzing-metric ${metric} \ 45 | --analyzing-num-workers ${num_workers} \ 46 | --analyzing-num-threads ${num_threads} \ 47 | --analyzing-num-threads-reduce ${num_threads_reduce} \ 48 | --vocab-file ${vocab_path} \ 49 | --data-path ${data_path} \ 50 | --data-impl mmap \ 51 | --tokenizer-type BertWordPieceLowerCase \ 52 | --micro-batch-size ${batch_size} \ 53 | --global-batch-size ${batch_size} \ 54 | --seq-length ${seq_len} \ 55 | --max-position-embeddings ${seq_len} \ 56 | --num-layers 1 \ 57 | --hidden-size 1 \ 58 | --num-attention-heads 1 \ 59 | --split 949,50,1 \ 60 | --distributed-backend gloo \ 61 | --train-data-exact-num-epochs ${num_epochs} \ 62 | --return-data-index \ 63 | --save-interval 1 \ 64 | --save ${save_path}" 65 | 66 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 | -------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 |
-------------------------------------------------------------------------------- /examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh: -------------------------------------------------------------------------------- 1 | hostname_and_rank=$1 2 | master_port=$2 3 | pretrained_checkpoint=$3 4 | 5 | # hostname_and_rank="worker-0:0,1,2,3" 6 | # master_port=12345 7 | # pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" 8 | 9 | tasks=( 10 | RTE 11 | MRPC 12 | STS-B 13 | CoLA 14 | SST-2 15 | QNLI 16 | QQP 17 | MNLI 18 | ) 19 | 20 | seeds=( 21 | 1234 22 | 1235 23 | 1236 24 | 1237 25 | 1238 26 | ) 27 | 28 | lrs=( 29 | 2e-5 30 | 3e-5 31 | 4e-5 32 | 5e-5 33 | ) 34 | 35 | for ((i=0;i<${#tasks[@]};++i)); do 36 | task=${tasks[i]} 37 | for ((j=0;j<${#seeds[@]};++j)); do 38 | seed=${seeds[j]} 39 | for ((k=0;k<${#lrs[@]};++k)); do 40 | lr=${lrs[k]} 41 | bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} 42 | done 43 | done 44 | done -------------------------------------------------------------------------------- /examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "encoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "encoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | }, 70 | "CL_2nd_METRIC_NAME": { 71 | "index_to_sample_path": "CL_2nd_SAMPLE_PATH", 72 | "index_to_metric_path": "CL_2nd_METRIC_PATH", 73 | "difficulty_type": "CL_2nd_DIFF_TYPE", 74 | "clustering_type": "CL_2nd_CLUSTER_TYPE", 75 | "min_difficulty": CL_2nd_MIN, 76 | "max_difficulty": CL_2nd_MAX, 77 | "schedule_type": "fixed_root", 78 | "schedule_config": { 79 | "total_curriculum_step": CL_2nd_TOTAL_STEP, 80 | "difficulty_step": CL_2nd_DIFF_STEP, 81 | "root_degree": CL_2nd_ROOT 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_workers=1 # Num nodes to run the map job 4 | num_threads=40 # Num threads on each node. Set this based on the number of CPU cores. 5 | 6 | # If different data epochs have slightly different data samples (e.g., due 7 | # to randomness), then you need to specify a num_epochs large enough to cover 8 | # the whole pretraining. If different data epochs are the same, set num_epochs to 9 | # 1 to only index 1 epoch, and during pretraining the DeepSpeed data efficiency 10 | # library will automatically handle reshuffling when reaching another epoch. 11 | num_epochs=1 12 | 13 | # Which node this is (starts at 0 and ends at num_workers-1). This 14 | # script only launches the map job on 1 worker node, since we don't expect 15 | # to run on many nodes and the workers don't need any communication. But you 16 | # can modify this script to add an MPI/torch distributed launcher.
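# For example (an illustrative single-node usage), worker 0 is launched as:
#   bash ds_analyze_gpt_data_map.sh 0
# and with num_workers=2 the second node would run: bash ds_analyze_gpt_data_map.sh 1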
17 | worker_id=$1 18 | save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" 19 | 20 | metric='total_vocab_freq' 21 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 22 | 23 | seq_len=2048 24 | batch_size=10000 25 | 26 | jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" 27 | # The Pile is a public dataset which can be downloaded at 28 | # https://mystic.the-eye.eu/public/AI/pile_neox/ 29 | ## Change data_home to your own training data path. 30 | # data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" 31 | data_home="/blob/data/the_pile_public_merged_nopreprocessing" 32 | data_path="${data_home}/pile_text_document" 33 | 34 | vocab_path="gpt2-vocab.json" 35 | if [ ! -f "$vocab_path" ]; then 36 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 37 | fi 38 | merge_path="gpt2-merges.txt" 39 | if [ ! -f "$merge_path" ]; then 40 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 41 | fi 42 | 43 | # Make sure the "--split" is the same as what you will use for pre-training. 44 | options=" \ 45 | --analyzing-task map \ 46 | --analyzing-data-type GPT \ 47 | --analyzing-metric ${metric} \ 48 | --analyzing-num-workers ${num_workers} \ 49 | --analyzing-worker-id ${worker_id} \ 50 | --analyzing-num-threads ${num_threads} \ 51 | --vocab-file ${vocab_path} \ 52 | --merge-file ${merge_path} \ 53 | --data-path ${data_path} \ 54 | --data-impl mmap \ 55 | --tokenizer-type GPT2BPETokenizer \ 56 | --micro-batch-size ${batch_size} \ 57 | --global-batch-size ${batch_size} \ 58 | --seq-length ${seq_len} \ 59 | --max-position-embeddings ${seq_len} \ 60 | --num-layers 1 \ 61 | --hidden-size 1 \ 62 | --num-attention-heads 1 \ 63 | --split 949,50,1 \ 64 | --distributed-backend gloo \ 65 | --train-data-exact-num-epochs ${num_epochs} \ 66 | --return-data-index \ 67 | --save-interval 1 \ 68 | --save ${save_path}" 69 | 70 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set these 2 to the same values you used during the map job. We need these 2 4 | # configs to know how many map job result files there are. 5 | num_workers=1 6 | num_threads=40 7 | # The reduce job only has 1 worker but can be accelerated by multithreading. 8 | num_threads_reduce=40 9 | 10 | # If different data epochs have slightly different data samples (e.g., due 11 | # to randomness), then you need to specify a num_epochs large enough to cover 12 | # the whole pretraining. If different data epochs are the same, set num_epochs to 13 | # 1 to only index 1 epoch, and during pretraining the DeepSpeed data efficiency 14 | # library will automatically handle reshuffling when reaching another epoch.
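# For example (assuming every map worker has finished and written its results
# under save_path), the reduce job is a single local invocation:
#   bash ds_analyze_gpt_data_reduce.sh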
15 | num_epochs=1 16 | 17 | save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" 18 | 19 | metric='total_vocab_freq' 20 | # metric='vocab_rarity' # this requires the result of total_vocab_freq 21 | 22 | seq_len=2048 23 | batch_size=10000 24 | 25 | jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-reduce" 26 | # The Pile is a public dataset which can be downloaded at 27 | # https://mystic.the-eye.eu/public/AI/pile_neox/ 28 | ## Change data_home to your own training data path. 29 | # data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" 30 | data_home="/blob/data/the_pile_public_merged_nopreprocessing" 31 | data_path="${data_home}/pile_text_document" 32 | 33 | vocab_path="gpt2-vocab.json" 34 | if [ ! -f "$vocab_path" ]; then 35 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 36 | fi 37 | merge_path="gpt2-merges.txt" 38 | if [ ! -f "$merge_path" ]; then 39 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 40 | fi 41 | 42 | # Make sure the "--split" is the same as what you will use for pre-training. 43 | options=" \ 44 | --analyzing-task reduce \ 45 | --analyzing-data-type GPT \ 46 | --analyzing-metric ${metric} \ 47 | --analyzing-num-workers ${num_workers} \ 48 | --analyzing-num-threads ${num_threads} \ 49 | --analyzing-num-threads-reduce ${num_threads_reduce} \ 50 | --vocab-file ${vocab_path} \ 51 | --merge-file ${merge_path} \ 52 | --data-path ${data_path} \ 53 | --data-impl mmap \ 54 | --tokenizer-type GPT2BPETokenizer \ 55 | --micro-batch-size ${batch_size} \ 56 | --global-batch-size ${batch_size} \ 57 | --seq-length ${seq_len} \ 58 | --max-position-embeddings ${seq_len} \ 59 | --num-layers 1 \ 60 | --hidden-size 1 \ 61 | --num-attention-heads 1 \ 62 | --split 949,50,1 \ 63 | --distributed-backend gloo \ 64 | --train-data-exact-num-epochs ${num_epochs} \ 65 | --return-data-index \ 66 | --save-interval 1 \ 67 | --save ${save_path}" 68 | 69 | python ../analyze_data.py ${options} &> ${jobname}.log -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 2048, 3 | "train_micro_batch_size_per_gpu": 16, 4 | "steps_per_print": 10, 5 | 6 | "zero_optimization": { 7 | "stage": 0, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": true, 13 | 14 | "fp16": { 15 | "enabled": false, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": false 25 | }, 26 | 27 | "wall_clock_breakdown" : false 28 | } -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | 4 | ## The code below only works when you run each evalharness task on a single GPU. 5 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh
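## Example invocation (hypothetical paths; the positional args, in order, are
## checkpoint, config, result dir, GPU rank, task list, host, batch size, fewshot):
##   bash ds_evalharness_1gpu.sh /blob/ckpt ds_config_eval_dummy.json /blob/results 0 lambada worker-0 32 0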
6 | checkpoint_path=$1 7 | config_path=$2 8 | result_path=$3 9 | rank=$4 10 | tasks=$5 11 | hostname=$6 12 | master_port=$(( 12345 + ${rank} )) 13 | batch_size=$7 14 | num_fewshot=$8 15 | 16 | mp_size=1 17 | pp_size=1 18 | no_pp="true" 19 | ep_size=1 20 | 21 | vocab_file="gpt2-vocab.json" 22 | if [ ! -f "$vocab_file" ]; then 23 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 24 | fi 25 | merge_file="gpt2-merges.txt" 26 | if [ ! -f "$merge_file" ]; then 27 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt 28 | fi 29 | 30 | export HF_DATASETS_OFFLINE=1 31 | 32 | dir2=$(dirname "$checkpoint_path") 33 | dirname=$(basename "$dir2")/$(basename "$checkpoint_path") 34 | result_path="${result_path}/${dirname}" 35 | mkdir -p $result_path 36 | result_file="${result_path}/${tasks}_${num_fewshot}shot.json" 37 | 38 | # Dummy arguments to make megatron happy. No need to configure them. 39 | # The reason we don't need to configure them and many other arguments is 40 | # that the eval framework will read the arguments from the checkpoint file. 41 | megatron_required_args="\ 42 | --num-layers -1 \ 43 | --hidden-size -1 \ 44 | --num-attention-heads -1 \ 45 | --seq-length -1 \ 46 | --max-position-embeddings -1 47 | " 48 | 49 | command="../../../../tasks/eval_harness/evaluate.py \ 50 | --load ${checkpoint_path} \ 51 | --tensor-model-parallel-size ${mp_size} \ 52 | --pipeline-model-parallel-size ${pp_size} \ 53 | --moe-expert-parallel-size ${ep_size} \ 54 | --vocab-file ${vocab_file} \ 55 | --merge-file ${merge_file} \ 56 | --micro-batch-size ${batch_size} \ 57 | --no-load-optim \ 58 | --no-load-rng \ 59 | --inference \ 60 | --disable-moe-token-dropping \ 61 | --adaptive_seq_len \ 62 | --eval_fp32 \ 63 | --num_fewshot ${num_fewshot} \ 64 | --task_list ${tasks} \ 65 | --results_path ${result_file} \ 66 | --deepspeed \ 67 | --deepspeed_config ${config_path} \ 68 | ${megatron_required_args} \ 69 | " 70 | 71 | if [[ "${no_pp}" = "true" ]]; then 72 | command="${command} \ 73 | --no-pipeline-parallel" 74 | fi 75 | 76 | launcher="deepspeed --include=$hostname:$rank --master_port=${master_port}" 77 | $launcher $command &> "${result_path}/${tasks}_${num_fewshot}shot.log" -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | checkpoint_paths=( 4 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ 5 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ 6 | ) 7 | 8 | ## No need to use the exact training config json; this dummy one is fine. 9 | config_path=ds_config_eval_dummy.json 10 | username=$(whoami) 11 | result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results" 12 | 13 | ## Task(s) on the same row will be performed together in the same process.
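## For example, the row "boolq,copa" below runs both tasks in one eval process.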
14 | ## Other tasks could also run, but we skip them because they either didn't 15 | ## appear in the GPT-3 paper or had strange scores there: qqp, prost, cb, wic, mrpc, sst, wnli, 16 | ## pubmedqa, logiqa, qnli, sciq, mc_taco, mathqa. wikitext didn't 17 | ## appear in the paper either, but we include it as a perplexity task. 18 | tasks=( 19 | record 20 | triviaqa 21 | hellaswag 22 | arc_challenge 23 | arc_easy 24 | race 25 | multirc 26 | openbookqa 27 | lambada 28 | webqs 29 | winogrande 30 | piqa 31 | anli_r1,anli_r2,anli_r3 32 | boolq,copa 33 | rte,wsc 34 | wikitext 35 | ) 36 | 37 | ## Use localhost if you didn't set up a hostfile as described in 38 | ## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. 39 | ## If a hostfile exists, use a hostname (e.g., worker-0) from the hostfile. 40 | # hostname="localhost" 41 | hostname="worker-0" 42 | 43 | batch_size=32 44 | 45 | ## This script is for zero-shot evaluation 46 | num_fewshot=0 47 | 48 | num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 49 | cuda_id=-1 50 | total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) 51 | 52 | ## The code below only works when you run each evalharness task on a single GPU. 53 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh 54 | for l in "${!checkpoint_paths[@]}"; do 55 | checkpoint_path=${checkpoint_paths[l]} 56 | for ((i=0;i<${#tasks[@]};++i)); do 57 | task=${tasks[i]} 58 | free_mem=0 59 | while [ $free_mem -lt $total_mem ]; do 60 | cuda_id=$(((cuda_id+1)%num_gpus)) 61 | free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) 62 | sleep 60s 63 | done 64 | bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & 65 | done 66 | done 67 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh: -------------------------------------------------------------------------------- 1 | ## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md 2 | ## and follow its installation and data downloading steps. 3 | checkpoint_paths=( 4 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ 5 | /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ 6 | ) 7 | 8 | ## No need to use the exact training config json; this dummy one is fine. 9 | config_path=ds_config_eval_dummy.json 10 | username=$(whoami) 11 | result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results_10shot" 12 | 13 | ## Task(s) on the same row will be performed together in the same process. 14 | tasks=( 15 | record 16 | triviaqa 17 | hellaswag 18 | arc_challenge 19 | arc_easy 20 | race 21 | multirc 22 | openbookqa 23 | lambada 24 | webqs 25 | winogrande 26 | piqa 27 | anli_r1,anli_r2 28 | anli_r3 29 | boolq,copa 30 | rte,wsc 31 | ) 32 | 33 | num_fewshot=10 34 | 35 | ## Use localhost if you didn't set up a hostfile as described in 36 | ## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. 37 | ## If a hostfile exists, use a hostname (e.g., worker-0) from the hostfile.
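## A minimal hostfile for the deepspeed launcher is one "<hostname> slots=<num_gpus>"
## line per node, e.g. (an illustrative example):
##   worker-0 slots=8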
38 | # hostname="localhost" 39 | hostname="worker-0" 40 | 41 | batch_size=16 42 | 43 | num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 44 | cuda_id=-1 45 | total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) 46 | 47 | ## Code below only works when you run each evalharness task on a single GPU. 48 | ## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh 49 | for l in "${!checkpoint_paths[@]}"; do 50 | checkpoint_path=${checkpoint_paths[l]} 51 | for ((i=0;i<${#tasks[@]};++i)); do 52 | task=${tasks[i]} 53 | free_mem=0 54 | while [ $free_mem -lt $total_mem ]; do 55 | cuda_id=$(((cuda_id+1)%num_gpus)) 56 | free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) 57 | sleep 60s 58 | done 59 | bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "decoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | 
"steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false, 24 | "dataloader_drop_last": true, 25 | "data_efficiency": { 26 | "enabled": true, 27 | "seed": DATA_EFFICIENCY_SEED, 28 | "data_routing": { 29 | "enabled": LTD_ENABLED, 30 | "random_ltd":{ 31 | "enabled": LTD_ENABLED, 32 | "total_layer_num": 24, 33 | "random_ltd_layer_num": 22, 34 | "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], 35 | "model_mask_name": "attention_mask", 36 | "model_type": "decoder", 37 | "hidden_state_order": "seq_batch_dim", 38 | "random_ltd_schedule": { 39 | "min_value": LTD_MIN, 40 | "max_value": LTD_MAX, 41 | "schedule_type":"fixed_linear", 42 | "schedule_config": { 43 | "require_steps": LTD_STEP, 44 | "seq_per_step": 16 45 | } 46 | } 47 | } 48 | }, 49 | "data_sampling": { 50 | "enabled": CL_ENABLED, 51 | "num_workers": DATA_SAMPLING_NUM_WORKERS, 52 | "curriculum_learning": { 53 | "enabled": CL_ENABLED, 54 | "data_cluster_path": "CL_CLUSTER_PATH", 55 | "curriculum_metrics": { 56 | "CL_1st_METRIC_NAME": { 57 | "index_to_sample_path": "CL_1st_SAMPLE_PATH", 58 | "index_to_metric_path": "CL_1st_METRIC_PATH", 59 | "difficulty_type": "CL_1st_DIFF_TYPE", 60 | "clustering_type": "CL_1st_CLUSTER_TYPE", 61 | "min_difficulty": CL_1st_MIN, 62 | "max_difficulty": CL_1st_MAX, 63 | "schedule_type": "fixed_root", 64 | "schedule_config": { 65 | "total_curriculum_step": CL_1st_TOTAL_STEP, 66 | "difficulty_step": CL_1st_DIFF_STEP, 67 | "root_degree": CL_1st_ROOT 68 | } 69 | }, 70 | "CL_2nd_METRIC_NAME": { 71 | "index_to_sample_path": "CL_2nd_SAMPLE_PATH", 72 | "index_to_metric_path": "CL_2nd_METRIC_PATH", 73 | "difficulty_type": "CL_2nd_DIFF_TYPE", 74 | "clustering_type": "CL_2nd_CLUSTER_TYPE", 75 | "min_difficulty": CL_2nd_MIN, 76 | "max_difficulty": CL_2nd_MAX, 77 | "schedule_type": "fixed_root", 78 | "schedule_config": { 79 | "total_curriculum_step": CL_2nd_TOTAL_STEP, 80 | "difficulty_step": CL_2nd_DIFF_STEP, 81 | "root_degree": CL_2nd_ROOT 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/evaluate_ict_zeroshot_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task ICT-ZEROSHOT-NQ \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --checkpoint-activations \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --num-workers 2 \ 33 | 
--faiss-use-gpu \ 34 | --retriever-report-topk-accuracies 1 5 20 100 \ 35 | --fp16 36 | 37 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA 
\ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | b=8 7 | mp=1 8 | experts=1 9 | nodes=1 10 | gpus=1 11 | 12 | 13 | use_tutel="" 14 | #use_tutel="--use-tutel" 15 | 16 | 17 | #ds_inference="" 18 | ds_inference="--ds-inference" 19 | 20 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" 21 | L=24 22 | H=1024 23 | A=16 24 | #experts1=${experts[$k]} 25 | program_cmd="tools/generate_samples_gpt.py \ 26 | --tensor-model-parallel-size $mp \ 27 | --num-layers $L \ 28 | --hidden-size $H \ 29 | --num-attention-heads $A \ 30 | --max-position-embeddings 1024 \ 31 | --tokenizer-type GPT2BPETokenizer \ 32 | --fp16 \ 33 | --num-experts ${experts} \ 34 | --mlp-type standard \ 35 | --micro-batch-size $b \ 36 | --seq-length 1024 \ 37 | --out-seq-length 1024 \ 38 | --temperature 1.0 \ 39 | --vocab-file $VOCAB_FILE \ 40 | --merge-file $MERGE_FILE \ 41 | --genfile unconditional_samples.json \ 42 | --top_p 0.9 \ 43 | --log-interval 1 \ 44 | --num-samples 0 \ 45 | --load $CHECKPOINT_PATH \ 46 | $use_tutel $ds_inference" 47 | 48 | echo $launch_cmd $program_cmd 49 | $launch_cmd $program_cmd 50 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path 
$DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_bert.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 32 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --train-iters 1000000 \ 26 | --save $CHECKPOINT_PATH \ 27 | --load $CHECKPOINT_PATH \ 28 | --data-path $DATA_PATH \ 29 | --vocab-file bert-vocab.txt \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.0001 \ 34 | --lr-decay-style linear \ 35 | --min-lr 1.0e-5 \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --lr-warmup-fraction .01 \ 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_bert.py \ 19 | --tensor-model-parallel-size 2 \ 20 | --pipeline-model-parallel-size 2 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --micro-batch-size 2 \ 25 | --global-batch-size 16 \ 26 | --max-position-embeddings 512 \ 27 | --train-iters 1000000 \ 28 | --save $CHECKPOINT_PATH \ 29 | --load $CHECKPOINT_PATH \ 30 | --data-path $DATA_PATH \ 31 | --vocab-file $VOCAB_FILE \ 32 | --data-impl mmap \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr 0.0001 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --lr-decay-iters 990000 \ 39 | --weight-decay 1e-2 \ 40 | --clip-grad 1.0 \ 41 | --lr-warmup-fraction .01 \ 42 | --log-interval 100 \ 43 | --save-interval 10000 \ 44 | --eval-interval 1000 \ 45 | --eval-iters 10 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 \ 52 | --checkpoint-activations " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --micro-batch-size 8 \ 24 | --global-batch-size 64 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --tensor-model-parallel-size 2 \ 21 | --pipeline-model-parallel-size 2 \ 22 | --num-layers 24 \ 23 | --hidden-size 1024 \ 24 | --num-attention-heads 16 \ 25 | --micro-batch-size 4 \ 26 | --global-batch-size 16 \ 27 | --seq-length 1024 \ 28 | --max-position-embeddings 1024 \ 29 | --train-iters 500000 \ 30 | --lr-decay-iters 320000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file gpt2-vocab.json \ 35 | --merge-file gpt2-merges.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --distributed-backend nccl \ 39 | --lr 0.00015 \ 40 | --lr-decay-style cosine \ 41 | --min-lr 1.0e-5 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --lr-warmup-fraction .01 \ 45 | --checkpoint-activations \ 46 | --log-interval 100 \ 47 | --save-interval 10000 \ 48 | --eval-interval 1000 \ 49 | --eval-iters 10 \ 50 | --fp16 51 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_llama_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | ###################################### 5 | # Change the configurations below as needed 6 | BASE_PATH=./tmp 7 | DS_CONFIG=${BASE_PATH}/deepspeed.json 8 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" 9 | DATASET="1 ${DATASET_1}" 10 | CHECKPOINT_PATH=./tmp 11 | TOKENIZER_PATH=./tmp/tokenizer.model # official llama tokenizer.model 12 | 13 | TP=2 14 | PP=2 15 | ZERO_STAGE=0 16 | 17 | GPUS_PER_NODE=8 18 | MASTER_ADDR=localhost 19 | MASTER_PORT=6000 20 | NNODES=1 21 | NODE_RANK=0 22 | 23 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 24 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 25 | NUM_LAYERS=24 # e.g. llama-13b: 40 26 | NUM_HEADS=16 # e.g. llama-13b: 40 27 | SEQ_LENGTH=2048 28 | 29 | MICRO_BATCH_SIZE=4 30 | GLOBAL_BATCH_SIZE=16 # e.g. llama: 4M tokens 31 | TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
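# (Sanity check on the token math above: this script's defaults give
# GLOBAL_BATCH_SIZE * SEQ_LENGTH = 16 * 2048 = 32K tokens per step; to match
# llama's ~4M-token batches you would raise GLOBAL_BATCH_SIZE to ~2048.)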
32 | LR=3e-4 33 | MIN_LR=3e-5 34 | LR_WARMUP_STEPS=2000 35 | WEIGHT_DECAY=0.1 36 | GRAD_CLIP=1 37 | 38 | # The configuration below is required for the llama model, as per the llama paper 39 | #--no-query-key-layer-scaling \ 40 | # --attention-dropout 0 \ 41 | # --hidden-dropout 0 \ 42 | # --position-embedding-type rope \ 43 | # --untie-embeddings-and-output-weights \ 44 | # --activation swiglu \ 45 | # --normalization rmsnorm \ 46 | # --no-bias-gelu-fusion \ 47 | # --no-bias-dropout-fusion \ 48 | # --no-bias \ 49 | ###################################### 50 | 51 | 52 | 53 | cat <<EOT > $DS_CONFIG 54 | { 55 | "train_batch_size" : $GLOBAL_BATCH_SIZE, 56 | "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, 57 | "steps_per_print": 1, 58 | 59 | "zero_optimization": { 60 | "stage": $ZERO_STAGE 61 | }, 62 | 63 | "bf16": { 64 | "enabled": true 65 | } 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 72 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 73 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 74 | 75 | 76 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 77 | 78 | torchrun $DISTRIBUTED_ARGS \ 79 | pretrain_gpt.py \ 80 | --tensor-model-parallel-size $TP \ 81 | --pipeline-model-parallel-size $PP \ 82 | --num-layers $NUM_LAYERS \ 83 | --hidden-size $HIDDEN_SIZE \ 84 | --ffn-hidden-size $FFN_HIDDEN_SIZE \ 85 | --num-attention-heads $NUM_HEADS \ 86 | --micro-batch-size $MICRO_BATCH_SIZE \ 87 | --global-batch-size $GLOBAL_BATCH_SIZE \ 88 | --seq-length $SEQ_LENGTH \ 89 | --max-position-embeddings $SEQ_LENGTH \ 90 | --train-iters $TRAIN_STEPS \ 91 | --save $CHECKPOINT_PATH \ 92 | --load $CHECKPOINT_PATH \ 93 | --data-path $DATASET \ 94 | --data-impl mmap \ 95 | --tokenizer-type SPTokenizer \ 96 | --tokenizer-model-file $TOKENIZER_PATH \ 97 | --split 949,50,1 \ 98 | --distributed-backend nccl \ 99 | --lr $LR \ 100 | --lr-decay-style cosine \ 101 | --min-lr $MIN_LR \ 102 | --weight-decay $WEIGHT_DECAY \ 103 | --clip-grad $GRAD_CLIP \ 104 | --lr-warmup-iters $LR_WARMUP_STEPS \ 105 | --optimizer adam \ 106 | --adam-beta1 0.9 \ 107 | --adam-beta2 0.95 \ 108 | --checkpoint-activations \ 109 | --log-interval 100 \ 110 | --save-interval 10000 \ 111 | --eval-interval 1000 \ 112 | --eval-iters 10 \ 113 | --bf16 \ 114 | --no-query-key-layer-scaling \ 115 | --attention-dropout 0 \ 116 | --hidden-dropout 0 \ 117 | --position-embedding-type rope \ 118 | --untie-embeddings-and-output-weights \ 119 | --activation swiglu \ 120 | --normalization rmsnorm \ 121 | --no-bias-gelu-fusion \ 122 | --no-bias-dropout-fusion \ 123 | --no-bias \ 124 | $ds_args -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path
$DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_t5.py \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_t5.py \ 18 | --tensor-model-parallel-size 2 \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --seq-length 512 \ 29 | --max-position-embeddings 512 \ 30 | --train-iters 1000000 \ 31 | --lr-decay-iters 1000000 \ 32 | --save $CHECKPOINT_PATH \ 33 | --load $CHECKPOINT_PATH \ 34 | --data-path $DATA_PATH \ 35 | --vocab-file t5-vocab.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --lr 0.0001 \ 39 | --min-lr 0.00001 \ 40 | --lr-decay-style linear \ 41 | --lr-warmup-fraction .01 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/run_deepspeed_example.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | BASE_PATH=/vc_data/Megatron-LM/data 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron 6 | DS_CONFIG=ds_config.json 7 | 8 | TP=1 9 | PP=1 10 | NLAYERS=24 11 | HIDDEN=512 12 | 13 | GLOBAL_BATCH=64 14 | MICRO_BATCH=4 15 | 16 | ZERO_STAGE=2 17 | 18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 20 | mkdir -p $OUTPUT_DIR 21 | 22 | cat <<EOT > $DS_CONFIG 23 | { 24 | "train_batch_size" : $GLOBAL_BATCH, 25 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 26 | "steps_per_print": 1, 27 | 28 | "zero_optimization": { 29 | "stage": $ZERO_STAGE 30 | }, 31 | 32 | "fp16": { 33 | "enabled": true, 34 | "initial_scale_power": 12 35 | }, 36 | 37 | "wall_clock_breakdown" : true 38 | } 39 | EOT 40 | 41 | export NCCL_DEBUG=warn 42 | 43 | ds_args="" 44 | ds_args=" --deepspeed ${ds_args}" 45 | ds_args=" --no-pipeline-parallel ${ds_args}" 46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 49 | 50 | 51 | deepspeed pretrain_gpt.py \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size $PP \ 54 | --num-layers $NLAYERS \ 55 | --hidden-size $HIDDEN \ 56 | --num-attention-heads 16 \ 57 | --seq-length 256 \ 58 | --loss-scale 12 \ 59 | --max-position-embeddings 1024 \ 60 | --micro-batch-size 4 \ 61 | --global-batch-size 1024 \ 62 | --train-iters 1000 \ 63 | --lr 6.0e-5 \ 64 | --min-lr 6.0e-6 \ 65 | --lr-decay-style cosine \ 66 | --log-interval 1 \ 67 | --eval-iters 40 \ 68 | --eval-interval 1000 \ 69 | --data-path $DATA_PATH \ 70 | --vocab-file $BASE_PATH/gpt2-vocab.json \ 71 | --merge-file $BASE_PATH/gpt2-merges.txt \ 72 | --save-interval 1000 \ 73 | --split 98,2,0 \ 74 | --clip-grad 1.0 \ 75 | --weight-decay 0.1 \ 76 | --adam-beta1 0.9 \ 77 | --adam-beta2 0.95 \ 78 | --init-method-std 0.006 \ 79 | --fp16 \ 80 | --checkpoint-activations \ 81 | --tensorboard-dir $OUTPUT_DIR \ 82 | $ds_args \ 83 | --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log 84 | 85 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LydiaXiaohongLi/Megatron-DeepSpeed/336573636757b6db74eab4218885460dc14cec58/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
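# Usage sketch (an illustrative, non-authoritative example; see pretrain_gpt.py
# for the real training flow):
#   from megatron import initialize_megatron, get_args, print_rank_0
#   initialize_megatron()              # parse args, set up distributed state
#   args = get_args()
#   print_rank_0(f"world size: {args.world_size}")  # prints once, on rank 0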
15 | import os 16 | import torch 17 | from deepspeed.accelerator import get_accelerator 18 | from .package_info import ( 19 | __description__, 20 | __contact_names__, 21 | __url__, 22 | __download_url__, 23 | __keywords__, 24 | __license__, 25 | __package_name__, 26 | __version__, 27 | ) 28 | 29 | from .global_vars import get_args 30 | from .global_vars import get_current_global_batch_size 31 | from .global_vars import get_num_microbatches 32 | from .global_vars import update_num_microbatches 33 | from .global_vars import get_tokenizer 34 | from .global_vars import get_tensorboard_writer 35 | from .global_vars import get_adlr_autoresume 36 | from .global_vars import get_timers 37 | from .initialize import initialize_megatron 38 | 39 | def print_rank_0(message): 40 | """If distributed is initialized, print only on rank 0.""" 41 | if torch.distributed.is_initialized(): 42 | if torch.distributed.get_rank() == 0: 43 | print(message, flush=True) 44 | else: 45 | print(message, flush=True) 46 | 47 | def is_last_rank(): 48 | return torch.distributed.get_rank() == ( 49 | torch.distributed.get_world_size() - 1) 50 | 51 | def print_rank_last(message): 52 | """If distributed is initialized, print only on last rank.""" 53 | if torch.distributed.is_initialized(): 54 | if is_last_rank(): 55 | print(message, flush=True) 56 | else: 57 | print(message, flush=True) 58 | 59 | def is_aml(): 60 | # Are we running inside an Azure Machine Learning (AML) environment? 61 | return 'AZUREML_EXPERIMENT_ID' in os.environ 62 | 63 | def is_rank_0(): 64 | """Check whether it is rank 0. For AML, check if it is rank 0 of a node""" 65 | if torch.distributed.is_initialized(): 66 | if torch.distributed.get_rank() == 0 or ( 67 | is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 68 | ): 69 | return True 70 | else: 71 | return False 72 | else: 73 | return True 74 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
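# Conceptually (an illustrative note, not part of the original file): blending
# three datasets with weights [0.5, 0.3, 0.2] builds a size-N index in which
# roughly 50%/30%/20% of positions point into datasets 0/1/2, so __getitem__(i)
# is a cheap two-array lookup: dataset_index[i] picks the dataset and
# dataset_sample_index[i] picks the sample within it.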
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indecies. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
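
The classes below keep only the fields needed to unpickle old checkpoints; the scaling logic itself is gone. For orientation, dynamic loss scaling follows roughly this rule, using exactly those fields (a sketch of the standard scheme, not the deprecated code):

def update_scale(scaler, has_overflow: bool) -> None:
    """Halve the scale on overflow; grow it after a clean window of steps."""
    if has_overflow:
        scaler.cur_scale = max(scaler.cur_scale / scaler.scale_factor,
                               scaler.min_scale)
        scaler.last_overflow_iter = scaler.cur_iter
    elif (scaler.cur_iter - scaler.last_overflow_iter) % scaler.scale_window == 0:
        # A full window without overflow: it is safe to try a larger scale.
        scaler.cur_scale *= scaler.scale_factor
    scaler.cur_iter += 1
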
15 | 16 | """For backward compatibility, we need the class definitions to deserialize.""" 17 | 18 | class LossScaler: 19 | def __init__(self, scale=1): 20 | self.cur_scale = scale 21 | 22 | class DynamicLossScaler: 23 | def __init__(self, 24 | init_scale=2**32, 25 | scale_factor=2., 26 | scale_window=1000, 27 | min_scale=1, 28 | delayed_shift=1, 29 | consecutive_hysteresis=False): 30 | self.cur_scale = init_scale 31 | self.cur_iter = 0 32 | self.last_overflow_iter = -1 33 | self.scale_factor = scale_factor 34 | self.scale_window = scale_window 35 | self.min_scale = min_scale 36 | self.delayed_shift = delayed_shift 37 | self.cur_hysteresis = delayed_shift 38 | self.consecutive_hysteresis = consecutive_hysteresis 39 | 40 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     torch::Tensor const& mask,
28 |     float scale_factor);
29 | 
30 | torch::Tensor bwd_cuda(
31 |     torch::Tensor const& output_grads,
32 |     torch::Tensor const& softmax_results,
33 |     float scale_factor);
34 | 
35 | torch::Tensor fwd(
36 |     torch::Tensor const& input,
37 |     torch::Tensor const& mask,
38 |     float scale_factor) {
39 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
40 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
41 |              (input.scalar_type() == at::ScalarType::BFloat16),
42 |              "Only fp16 and bf16 are supported");
43 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
44 | 
45 |   return fwd_cuda(input, mask, scale_factor);
46 | }
47 | 
48 | torch::Tensor bwd(
49 |     torch::Tensor const& output_grads,
50 |     torch::Tensor const& softmax_results,
51 |     float scale_factor) {
52 | 
53 |   AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor");
54 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor");
55 | 
56 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
57 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
58 |              "Only fp16 and bf16 are supported");
59 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
60 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
61 |              "Only fp16 and bf16 are supported");
62 | 
63 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
64 | }
65 | 
66 | } // end namespace scaled_masked_softmax
67 | } // end namespace fused_softmax
68 | } // end namespace multihead_attn
69 | 
70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
71 |   m.def("forward",
72 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
73 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
74 |   m.def("backward",
75 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
76 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
77 | }
78 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads,
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
35 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
36 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
37 |              (input.scalar_type() == at::ScalarType::BFloat16),
38 |              "Only fp16 and bf16 are supported");
39 | 
40 |   return fwd_cuda(input, scale_factor);
41 | }
42 | 
43 | torch::Tensor bwd(
44 |     torch::Tensor const& output_grads,
45 |     torch::Tensor const& softmax_results,
46 |     float scale_factor) {
47 | 
48 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
49 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
50 | 
51 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
52 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
53 |              "Only fp16 and bf16 are supported");
54 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
55 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
56 |              "Only fp16 and bf16 are supported");
57 | 
58 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
59 | }
60 | 
61 | } // end namespace scaled_upper_triang_masked_softmax
62 | } // end namespace fused_softmax
63 | } // end namespace multihead_attn
64 | 
65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
66 |   m.def("forward",
67 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
68 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
69 |   m.def("backward",
70 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
71 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
72 | }
73 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <cuda.h>
19 | #include <cuda_runtime.h>
20 | #include <cuda_fp16.h>
21 | #ifndef __HIP_PLATFORM_HCC__
22 | #include <cuda_profiler_api.h>
23 | #endif
24 | #include <ATen/cuda/CUDAContext.h>
25 | #include <torch/extension.h>
26 | #include "scaled_upper_triang_masked_softmax.h"
27 | #include "type_shim.h"
28 | 
29 | namespace multihead_attn {
30 | namespace fused_softmax {
31 | namespace scaled_upper_triang_masked_softmax {
32 | 
33 | torch::Tensor fwd_cuda(
34 |     torch::Tensor const& input,
35 |     float scale_factor)
36 | {
37 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
38 |   const int attn_batches = input.size(0);
39 |   const int seq_len = input.size(1);
40 |   TORCH_INTERNAL_ASSERT(seq_len <= 2048);
41 | 
42 |   // Output
43 |   auto act_options = input.options().requires_grad(false);
44 |   torch::Tensor softmax_results =
45 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
46 | 
47 |   // Softmax Intermediate Result Ptr
48 |   void* input_ptr = static_cast<void*>(input.data_ptr());
49 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
50 | 
51 |   DISPATCH_HALF_AND_BFLOAT(
52 |       input.scalar_type(),
53 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
54 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
55 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
56 |           reinterpret_cast<const scalar_t*>(input_ptr),
57 |           scale_factor,
58 |           seq_len,
59 |           seq_len,
60 |           attn_batches);
61 |   );
62 |   return softmax_results;
63 | }
64 | 
65 | 
66 | torch::Tensor bwd_cuda(
67 |     torch::Tensor const& output_grads_,
68 |     torch::Tensor const& softmax_results_,
69 |     float scale_factor)  {
70 | 
71 |   auto output_grads = output_grads_.contiguous();
72 |   auto softmax_results = softmax_results_.contiguous();
73 | 
74 |   //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
75 |   const int attn_batches = output_grads.size(0);
76 |   const int seq_len = output_grads.size(1);
77 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
78 | 
79 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
80 | 
81 |   //Softmax Grad
82 |   DISPATCH_HALF_AND_BFLOAT(
83 |       output_grads_.scalar_type(),
84 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
85 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
86 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
87 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
88 |           reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
89 |           scale_factor,
90 |           seq_len,
91 |           seq_len,
92 |           attn_batches);
93 |   );
94 | 
95 |   //backward pass is completely in-place
96 |   return output_grads;
97 | }
98 | }
99 | }
100 | }
101 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | 
18 | #include <ATen/ATen.h>
19 | #include "compat.h"
20 | 
21 | 
22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)
\ 23 | switch(TYPE) \ 24 | { \ 25 | case at::ScalarType::Half: \ 26 | { \ 27 | using scalar_t = at::Half; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::BFloat16: \ 32 | { \ 33 | using scalar_t = at::BFloat16; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | default: \ 38 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 39 | } 40 | 41 | 42 | 43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 44 | switch(TYPEIN) \ 45 | { \ 46 | case at::ScalarType::Float: \ 47 | { \ 48 | using scalar_t_in = float; \ 49 | switch(TYPEOUT) \ 50 | { \ 51 | case at::ScalarType::Float: \ 52 | { \ 53 | using scalar_t_out = float; \ 54 | __VA_ARGS__; \ 55 | break; \ 56 | } \ 57 | case at::ScalarType::Half: \ 58 | { \ 59 | using scalar_t_out = at::Half; \ 60 | __VA_ARGS__; \ 61 | break; \ 62 | } \ 63 | case at::ScalarType::BFloat16: \ 64 | { \ 65 | using scalar_t_out = at::BFloat16; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | default: \ 70 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 71 | } \ 72 | break; \ 73 | } \ 74 | case at::ScalarType::Half: \ 75 | { \ 76 | using scalar_t_in = at::Half; \ 77 | using scalar_t_out = at::Half; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | case at::ScalarType::BFloat16: \ 82 | { \ 83 | using scalar_t_in = at::BFloat16; \ 84 | using scalar_t_out = at::BFloat16; \ 85 | __VA_ARGS__; \ 86 | break; \ 87 | } \ 88 | default: \ 89 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 90 | } 91 | 92 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from deepspeed.accelerator.real_accelerator import get_accelerator 16 | if get_accelerator().device_name() == 'cuda': 17 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 18 | else: 19 | from torch.nn import LayerNorm 20 | from .distributed import DistributedDataParallel 21 | from .bert_model import BertModel 22 | from .gpt_model import GPTModel, GPTModelPipe 23 | from .llama_model import LlamaModel, LlamaModelPipe 24 | from .t5_model import T5Model 25 | from .language_model import get_language_model 26 | from .module import Float16Module 27 | from .rotary_pos_embedding import RotaryEmbedding 28 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
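
The tanh curve in fused_bias_gelu.py above is only an approximation of GeLU; a quick way to sanity-check both it and the hand-derived gradient is to compare against torch's erf-based GeLU and against autograd (test sketch, not part of the module):

import torch
from megatron.model.fused_bias_gelu import bias_gelu, bias_gelu_back

y = torch.randn(64, dtype=torch.double, requires_grad=True)
bias = torch.randn(64, dtype=torch.double, requires_grad=True)

out = bias_gelu(bias, y)
exact = torch.nn.functional.gelu(y + bias)      # erf-based reference
assert torch.allclose(out, exact, atol=1e-2)    # tanh approx is close, not equal

g = torch.ones_like(out)
manual = bias_gelu_back(g, bias, y)             # hand-derived gradient
auto, = torch.autograd.grad(out.sum(), y)       # autograd through the same formula
assert torch.allclose(manual, auto, atol=1e-10)
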
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """This code is copied from NVIDIA apex:
17 |       https://github.com/NVIDIA/apex
18 |    with some changes. """
19 | 
20 | import numbers
21 | import torch
22 | from torch.nn.parameter import Parameter
23 | from torch.nn import init
24 | import importlib
25 | from torch.nn import functional as F
26 | 
27 | global fused_mix_prec_layer_norm_cuda
28 | fused_mix_prec_layer_norm_cuda = None
29 | 
30 | 
31 | class FusedLayerNormAffineFunction(torch.autograd.Function):
32 | 
33 |   @staticmethod
34 |   def forward(ctx, input, weight, bias, normalized_shape, eps):
35 | 
36 |     ctx.normalized_shape = normalized_shape
37 |     ctx.eps = eps
38 |     input_ = input.contiguous()
39 |     weight_ = weight.contiguous()
40 |     bias_ = bias.contiguous()
41 |     output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
42 |         input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
43 |     ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
44 | 
45 |     return output
46 | 
47 | 
48 |   @staticmethod
49 |   def backward(ctx, grad_output):
50 | 
51 |     input_, weight_, bias_, mean, invvar = ctx.saved_tensors
52 |     grad_input = grad_weight = grad_bias = None
53 |     grad_input, grad_weight, grad_bias \
54 |       = fused_mix_prec_layer_norm_cuda.backward_affine(
55 |           grad_output.contiguous(), mean, invvar,
56 |           input_, ctx.normalized_shape,
57 |           weight_, bias_, ctx.eps)
58 | 
59 |     return grad_input, grad_weight, grad_bias, None, None
60 | 
61 | 
62 | 
63 | class MixedFusedLayerNorm(torch.nn.Module):
64 | 
65 |   def __init__(self, normalized_shape, eps=1e-5):
66 |     super(MixedFusedLayerNorm, self).__init__()
67 | 
68 |     global fused_mix_prec_layer_norm_cuda
69 |     fused_mix_prec_layer_norm_cuda = importlib.import_module(
70 |       "fused_mix_prec_layer_norm_cuda")
71 | 
72 |     if isinstance(normalized_shape, numbers.Integral):
73 |         normalized_shape = (normalized_shape,)
74 |     self.normalized_shape = torch.Size(normalized_shape)
75 |     self.eps = eps
76 |     self.weight = Parameter(torch.Tensor(*normalized_shape))
77 |     self.bias = Parameter(torch.Tensor(*normalized_shape))
78 |     self.reset_parameters()
79 | 
80 | 
81 |   def reset_parameters(self):
82 | 
83 |     init.ones_(self.weight)
84 |     init.zeros_(self.bias)
85 | 
86 | 
87 |   def forward(self, input):
88 |     # CPU path is here for unittest sake.
89 |     if not input.is_cuda:
90 |         print("WARNING! The input of FusedLayerNorm should be on the GPU. "
91 |               "This warning should only be triggered in the FusedLayerNorm unit tests.")
92 |         return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
93 |     return FusedLayerNormAffineFunction.apply(
94 |         input, self.weight, self.bias, self.normalized_shape, self.eps)
95 | 
--------------------------------------------------------------------------------
/megatron/model/rotary_pos_embedding.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
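
For intuition, forward_affine above computes the usual affine layer norm (and additionally hands back mean and invvar for the backward pass). A plain-PyTorch equivalent of its output (reference sketch, not the fused kernel):

import torch

def layer_norm_affine_ref(x, weight, bias, normalized_shape, eps):
    # Statistics are taken over the trailing `normalized_shape` dimensions.
    dims = tuple(range(-len(normalized_shape), 0))
    mean = x.mean(dim=dims, keepdim=True)
    var = x.var(dim=dims, unbiased=False, keepdim=True)
    invvar = torch.rsqrt(var + eps)
    return (x - mean) * invvar * weight + bias

x = torch.randn(4, 1024)
ref = layer_norm_affine_ref(x, torch.ones(1024), torch.zeros(1024), (1024,), 1e-5)
assert torch.allclose(ref, torch.nn.functional.layer_norm(x, (1024,)), atol=1e-5)
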
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import torch
17 | from einops import rearrange
18 | from torch import einsum, nn
19 | 
20 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
21 | 
22 | 
23 | class RotaryEmbedding(nn.Module):
24 |     def __init__(self, dim):
25 |         super().__init__()
26 |         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
27 |         self.register_buffer('inv_freq', inv_freq)
28 | 
29 |     def forward(self, max_seq_len, offset=0):
30 |         seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
31 |         freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
32 |         # first part even vector components, second part odd vector components,
33 |         # 2 * dim in dimension size
34 |         emb = torch.cat((freqs, freqs), dim=-1)
35 |         # emb [seq_length, .., dim]
36 |         return rearrange(emb, 'n d -> n 1 1 d')
37 | 
38 | 
39 | def _rotate_half(x):
40 |     """
41 |     change sign so the last dimension becomes [-odd, +even]
42 |     """
43 |     x = rearrange(x, '... (j d) -> ... j d', j=2)
44 |     x1, x2 = x.unbind(dim=-2)
45 |     return torch.cat((-x2, x1), dim=-1)
46 | 
47 | 
48 | def apply_rotary_pos_emb(t, freqs):
49 |     """
50 |     input tensor t is of shape [seq_length, ..., dim]
51 |     rotary positional embedding tensor freqs is of shape [seq_length, ..., dim]
52 |     check https://kexue.fm/archives/8265 for detailed formulas
53 |     """
54 |     rot_dim = freqs.shape[-1]
55 |     # ideally t_pass is empty, so the rotary position embedding is applied to all of t
56 |     t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
57 |     # first part is cosine component
58 |     # second part is sine component, need to change signs with _rotate_half method
59 |     t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin())
60 |     return torch.cat((t, t_pass), dim=-1)
61 | 
--------------------------------------------------------------------------------
/megatron/model/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
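
Putting the rotary pieces above together: the freqs tensor from RotaryEmbedding broadcasts over the batch and head axes of a [seq, batch, heads, head_dim] query, and since each (even, odd) pair is rotated by an angle, vector norms are preserved. A small usage sketch:

import torch
from megatron.model.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb

seq_len, batch, heads, head_dim = 16, 2, 4, 32
q = torch.randn(seq_len, batch, heads, head_dim)

rotary = RotaryEmbedding(head_dim)
freqs = rotary(seq_len)                 # [seq_len, 1, 1, head_dim]
q_rot = apply_rotary_pos_emb(q, freqs)  # same shape, positions mixed into q
assert q_rot.shape == q.shape
# Per-position rotation preserves the norm of each head vector.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-4)
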
15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from megatron import get_args 23 | 24 | def init_method_normal(sigma): 25 | """Init method based on N(0, sigma).""" 26 | def init_(tensor): 27 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 28 | 29 | return init_ 30 | 31 | 32 | def scaled_init_method_normal(sigma, num_layers): 33 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 34 | std = sigma / math.sqrt(2.0 * num_layers) 35 | 36 | def init_(tensor): 37 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 38 | 39 | return init_ 40 | 41 | 42 | def attention_mask_func(attention_scores, attention_mask): 43 | args = get_args() 44 | if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: 45 | attention_mask_ = attention_mask 46 | actual_seqlen = attention_scores.size()[2] 47 | if actual_seqlen != attention_mask_.size()[2]: 48 | # attention_mask has size [1, 1, seqlen, seqlen] 49 | attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() 50 | attention_scores.masked_fill_(attention_mask_, -10000.0) 51 | else: 52 | attention_scores.masked_fill_(attention_mask, -10000.0) 53 | return attention_scores 54 | 55 | 56 | def get_linear_layer(rows, columns, init_method): 57 | """Simple linear layer with weight initialization.""" 58 | layer = torch.nn.Linear(rows, columns) 59 | init_method(layer.weight) 60 | with torch.no_grad(): 61 | layer.bias.zero_() 62 | return layer 63 | 64 | @torch.jit.script 65 | def gelu_impl(x): 66 | """OpenAI's gelu implementation.""" 67 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 68 | (1.0 + 0.044715 * x * x))) 69 | def openai_gelu(x): 70 | return gelu_impl(x) 71 | 72 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 73 | @torch.jit.script 74 | def erf_gelu(x): 75 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 76 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import get_tensor_model_parallel_src_rank 35 | from .initialize import get_pipeline_model_parallel_first_rank 36 | from .initialize import get_pipeline_model_parallel_last_rank 37 | from .initialize import get_pipeline_model_parallel_next_rank 38 | from .initialize import get_pipeline_model_parallel_prev_rank 39 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 40 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 41 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 42 | from .initialize import initialize_model_parallel 43 | from .initialize import model_parallel_is_initialized 44 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 45 | 46 | from .layers import ColumnParallelLinear 47 | from .layers import RowParallelLinear 48 | from .layers import VocabParallelEmbedding 49 | from .layers import (set_tensor_model_parallel_attributes, 50 | set_defaults_if_not_set_tensor_model_parallel_attributes, 51 | copy_tensor_model_parallel_attributes) 52 | 53 | from .mappings import copy_to_tensor_model_parallel_region 54 | from .mappings import gather_from_tensor_model_parallel_region 55 | from .mappings import reduce_from_tensor_model_parallel_region 56 | from .mappings import scatter_to_tensor_model_parallel_region 57 | 58 | from .random import checkpoint 59 | from .random import get_cuda_rng_tracker 60 | from .random import init_checkpointed_activations_memory_buffer 61 | from .random import model_parallel_cuda_manual_seed 62 | from .random import reset_checkpointed_activations_memory_buffer 63 | from .random import gather_split_1d_tensor 64 | from .random import split_tensor_into_1d_equal_chunks 65 | 66 | from .utils import divide 67 | from .utils import split_tensor_along_last_dim 68 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LydiaXiaohongLi/Megatron-DeepSpeed/336573636757b6db74eab4218885460dc14cec58/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | from deepspeed.accelerator import get_accelerator 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducability.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % get_accelerator().device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | get_accelerator().set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from deepspeed.accelerator import get_accelerator 19 | from mpu import data as data_utils 20 | import mpu 21 | import torch 22 | import functools 23 | import operator 24 | import sys 25 | sys.path.append("../..") 26 | 27 | 28 | def test_broadcast_data(tensor_model_parallel_size): 29 | 30 | if torch.distributed.get_rank() == 0: 31 | print('> testing broadcast_data with model parallel size {} ...'. 32 | format(tensor_model_parallel_size)) 33 | 34 | mpu.initialize_model_parallel(tensor_model_parallel_size) 35 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 36 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 37 | 38 | key_size_t = {'key1': [7, 11], 39 | 'key2': [8, 2, 1], 40 | 'key3': [13], 41 | 'key4': [5, 1, 2], 42 | 'key5': [5, 12]} 43 | keys = list(key_size_t.keys()) 44 | 45 | data = {} 46 | data_t = {} 47 | for key in key_size_t: 48 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 49 | data_t[key] = data[key].clone() 50 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 51 | data_t['keyX'] = data['keyX'].clone() 52 | if mpu.get_tensor_model_parallel_rank() != 0: 53 | data = None 54 | 55 | data_utils._check_data_types(keys, data_t, torch.int64) 56 | key_size, key_numel, \ 57 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 58 | for key in keys: 59 | assert key_size[key] == key_size_t[key] 60 | total_numel_t = 0 61 | for key in keys: 62 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 63 | assert key_numel[key] == target_size 64 | total_numel_t += target_size 65 | assert total_numel == total_numel_t 66 | 67 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 68 | for key in keys: 69 | tensor = data_t[key].to(get_accelerator().device_name()) 70 | assert data_b[key].sub(tensor).abs().max() == 0 71 | 72 | # Reset groups 73 | mpu.destroy_tensor_model_parallel() 74 | 75 | torch.distributed.barrier() 76 | if torch.distributed.get_rank() == 0: 77 | print('>> passed the test :-)') 78 | 79 | 80 | if __name__ == '__main__': 81 | 82 | initialize_distributed() 83 | world_size = torch.distributed.get_world_size() 84 | 85 | tensor_model_parallel_size = 1 86 | while tensor_model_parallel_size <= world_size: 87 | print_separator('test test broadcast data') 88 | test_broadcast_data(tensor_model_parallel_size) 89 | tensor_model_parallel_size *= 2 90 | -------------------------------------------------------------------------------- /megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
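
mpu/utils.py below provides the divisibility helpers used throughout the tensor-parallel layers, for example to split a hidden dimension across ranks and to locate a rank's vocabulary shard (usage sketch):

import torch
from megatron.mpu.utils import divide, split_tensor_along_last_dim, VocabUtility

# 4-way tensor parallelism over a 1024-wide activation: each rank gets 256.
x = torch.randn(8, 1024)
chunks = split_tensor_along_last_dim(x, 4)
assert all(c.shape == (8, 256) for c in chunks)

# Rank 1 of 4 owns vocabulary ids [12800, 25600) of a 51200-entry vocab.
first, last = VocabUtility.vocab_range_from_global_vocab_size(51200, 1, 4)
assert (first, last) == (12800, 25600)
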
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/megatron/package_info.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | MAJOR = 1
17 | MINOR = 1.5
18 | 
19 | # Use the following formatting: (major, minor)
20 | VERSION = (MAJOR, MINOR)
21 | 
22 | __version__ = '.'.join(map(str, VERSION))
23 | __package_name__ = 'megatron-lm'
24 | __contact_names__ = 'NVIDIA INC'
25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM'
26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /megatron/tokenizer/sp_tokenization.py: -------------------------------------------------------------------------------- 1 | # from: https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py 2 | """Tokenization classes for SentencePiece tokenizer""" 3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from typing import List 6 | import os 7 | 8 | 9 | 10 | class SPTokenizer: 11 | def __init__(self, model_path: str): 12 | # reload tokenizer 13 | assert os.path.isfile(model_path), model_path 14 | self.sp_model = SentencePieceProcessor(model_file=model_path) 15 | 16 | # BOS / EOS token IDs 17 | self.n_words: int = self.sp_model.vocab_size() 18 | self.bos_id: int = self.sp_model.bos_id() 19 | self.eos_id: int = self.sp_model.eos_id() 20 | self.pad_id: int = self.sp_model.pad_id() 21 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 22 | 23 | def encode(self, s, bos=False, eos=False) -> List[int]: 24 | assert type(s) is str 25 | t = self.sp_model.encode(s) 26 | if bos: 27 | t = [self.bos_id] + t 28 | if eos: 29 | t = t + [self.eos_id] 30 | return t 31 | 32 | def decode(self, t: List[int]) -> str: 33 | return self.sp_model.decode(t) -------------------------------------------------------------------------------- /pretrain_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
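
The script that follows is the whole task-specific surface area Megatron asks for: pretrain() consumes three callables. Their shapes, written out as type aliases (our annotation, inferred from the code below; Megatron itself is untyped):

from typing import Callable, Dict, Iterator, Optional, Sequence, Tuple
import torch

ModelProvider = Callable[[], torch.nn.Module]
ForwardStep = Callable[
    [Iterator, torch.nn.Module, Optional[torch.Tensor]],
    Tuple[torch.Tensor, Dict[str, torch.Tensor]],
]
DatasetsProvider = Callable[[Sequence[int]], Tuple]  # (train, valid, test-or-None)
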
15 | 16 | """Pretrain VIT""" 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | from megatron import get_args, get_timers, mpu, print_rank_0 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from megatron.model.vit_model import VitModel 23 | from megatron.training import pretrain 24 | from megatron.utils import average_losses_across_data_parallel_group 25 | 26 | def model_provider(): 27 | """Build the model.""" 28 | 29 | print_rank_0("building VIT model ...") 30 | args = get_args() 31 | 32 | model = VitModel(num_classes=args.num_classes) 33 | return model 34 | 35 | def get_batch(data_iterator): 36 | """Build the batch.""" 37 | data = next(data_iterator) 38 | 39 | # only data parallelism; no need for broadcast 40 | images = data[0].to(get_accelerator().device_name()) 41 | labels = data[1].to(get_accelerator().device_name()) 42 | 43 | return images, labels 44 | 45 | def forward_step(data_iterator, model, input_tensor): 46 | """Forward step.""" 47 | timers = get_timers() 48 | assert input_tensor is None 49 | 50 | # Get the batch. 51 | timers("batch-generator").start() 52 | ( 53 | images, 54 | labels, 55 | ) = get_batch(data_iterator) 56 | timers("batch-generator").stop() 57 | 58 | # Forward model. lm_labels 59 | logits = model(images).contiguous().float() 60 | loss = F.cross_entropy(logits, labels) 61 | 62 | outputs = torch.argmax(logits, -1) 63 | correct = (outputs == labels).float() 64 | accuracy = torch.mean(correct) 65 | 66 | averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) 67 | 68 | return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} 69 | 70 | 71 | def train_valid_test_datasets_provider(train_val_test_num_samples): 72 | """Build train, valid, and test datasets.""" 73 | args = get_args() 74 | 75 | print_rank_0( 76 | "> building train, validation, and test datasets " "for VIT ..." 77 | ) 78 | train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path) 79 | print_rank_0("> finished creating VIT datasets ...") 80 | 81 | return train_ds, valid_ds, None 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | pretrain( 87 | train_valid_test_datasets_provider, 88 | model_provider, 89 | forward_step, 90 | args_defaults={'dataloader_type': 'cyclic'} 91 | ) 92 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Setup for pip package.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified taks in the evaluation harness 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 
7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default="all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 4 | # under the license https://huggingface.co/spaces/bigscience/license 5 | 6 | # this script converts results.json: 7 | # 8 | # "results": { 9 | # "arc_challenge": { 10 | # "acc": 0.24232081911262798, 11 | # "acc_stderr": 0.01252159329580012, 12 | # "acc_norm": 0.2764505119453925, 13 | # "acc_norm_stderr": 0.013069662474252425 14 | # }, 15 | # 16 | # into a format expected by a spreadsheet, which is: 17 | # 18 | # task metric value err 19 | # arc_challenge acc xxx yyy 20 | # arc_challenge acc_norm xxx yyy 21 | # arc_challenge f1 xxx yyy 22 | # 23 | # usage: 24 | # report-to-csv.py results.json 25 | 26 | 27 | import sys 28 | import json 29 | import io 30 | import csv 31 | 32 | results_file = sys.argv[1] 33 | 34 | csv_file = results_file.replace(".json", ".csv") 35 | 36 | print(f"Converting {results_file} to {csv_file}") 37 | 38 | with io.open(results_file, 'r', encoding='utf-8') as f: 39 | results = json.load(f) 40 | 41 | with io.open(csv_file, 'w', encoding='utf-8') as f: 42 | 43 | writer = csv.writer(f) 44 | writer.writerow(["task", "metric", "value", "err", "version"]) 45 | 46 | versions = results["versions"] 47 | 48 | for k, v in sorted(results["results"].items()): 49 | if k not in versions: 50 | versions[k] = -1 51 | 52 | if "acc" in v: 53 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 54 | if "acc_norm" in v: 55 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 56 | if "f1" in v: 57 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 58 | # if "ppl" in v: 59 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 60 | # if "em" in v: 61 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 62 | -------------------------------------------------------------------------------- /tasks/glue/cola.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """CoLA dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = [0, 1] 24 | 25 | 26 | class CoLADataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label=0): 30 | self.test_label = test_label 31 | super().__init__('CoLA', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 2: 48 | is_test = True 49 | print_rank_0(' reading {} and {} columns and ' 50 | 'setting labels to {}'.format( 51 | row[0].strip(), row[1].strip(), 52 | self.test_label)) 53 | continue 54 | 55 | if is_test: 56 | assert len(row) == 2, 'expected length 2: {}'.format(row) 57 | uid = int(row[0].strip()) 58 | text_a = clean_text(row[1].strip()) 59 | text_b = None 60 | label = self.test_label 61 | assert len(text_a) > 0 62 | else: 63 | if len(row) == 4: 64 | uid = total 65 | text_a = clean_text(row[3].strip()) 66 | text_b = None 67 | label = int(row[1].strip()) 68 | else: 69 | print_rank_0('***WARNING*** index error, ' 70 | 'skipping: {}'.format(row)) 71 | continue 72 | if len(text_a) == 0: 73 | print_rank_0('***WARNING*** zero length a, ' 74 | 'skipping: {}'.format(row)) 75 | continue 76 | assert label in LABELS 77 | assert uid >= 0 78 | 79 | sample = {'uid': uid, 80 | 'text_a': text_a, 81 | 'text_b': text_b, 82 | 'label': label} 83 | total += 1 84 | samples.append(sample) 85 | 86 | if total % 50000 == 0: 87 | print_rank_0(' > processed {} so far ...'.format(total)) 88 | 89 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 90 | return samples 91 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs.
34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {} , {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /tasks/glue/sst2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """SST-2 dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = [0, 1] 24 | 25 | 26 | class SST2Dataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label=0): 30 | self.test_label = test_label 31 | super().__init__('SST-2', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if row[0].strip() == 'index': 48 | is_test = True 49 | print_rank_0(' reading {} and {} columns and ' 50 | 'setting labels to {}'.format( 51 | row[0].strip(), row[1].strip(), 52 | self.test_label)) 53 | else: 54 | assert len(row) == 2 55 | print_rank_0(' reading {} and {} columns' 56 | ' ...'.format( 57 | row[0].strip(), row[1].strip())) 58 | continue 59 | 60 | if is_test: 61 | assert len(row) == 2, 'expected length 2: {}'.format(row) 62 | uid = int(row[0].strip()) 63 | text_a = clean_text(row[1].strip()) 64 | text_b = None 65 | label = self.test_label 66 | assert len(text_a) > 0 67 | else: 68 | if len(row) == 2: 69 | uid = total 70 | text_a = clean_text(row[0].strip()) 71 | text_b = None 72 | label = int(row[-1].strip()) 73 | else: 74 | print_rank_0('***WARNING*** index error, ' 75 | 'skipping: {}'.format(row)) 76 | continue 77 | if len(text_a) == 0: 78 | print_rank_0('***WARNING*** zero length a, ' 79 | 'skipping: {}'.format(row)) 80 | continue 81 | assert label in LABELS 82 | assert uid >= 0 83 | 84 | sample = {'uid': uid, 85 | 'text_a': text_a, 86 | 'text_b': text_b, 87 | 'label': label} 88 | total += 1 89 | samples.append(sample) 90 | 91 | if total % 50000 == 0: 92 | print_rank_0(' > processed {} so far ...'.format(total)) 93 | 94 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 95 | return samples 96 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | from megatron import get_args 22 | from tasks.orqa.evaluate_utils import ORQAEvaluator 23 | 24 | def main(): 25 | """ 26 | Main program 27 | """ 28 | 29 | args = get_args() 30 | 31 | # Set up the model and evaluator 32 | evaluator = ORQAEvaluator() 33 | 34 | # Run evaluation 35 | if args.qa_data_dev is not None: 36 | evaluator.evaluate(args.qa_data_dev, "DEV") 37 | 38 | if args.qa_data_test is not None: 39 | evaluator.evaluate(args.qa_data_test, "TEST") 40 | 41 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Privde metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True) 44 | 45 | # Finetune/evaluate. 46 | finetune( 47 | train_valid_datasets_provider, 48 | model_provider, 49 | end_of_epoch_callback_provider=accuracy_func_provider, 50 | ) 51 | 52 | 53 | def main(): 54 | classification() 55 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | import torch 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import mpu 23 | from tasks.vision.finetune_utils import build_data_loader 24 | from tasks.vision.finetune_utils import process_batch 25 | from torchvision import datasets, transforms 26 | from deepspeed.accelerator import get_accelerator 27 | 28 | def accuracy_func_provider(): 29 | """Provide function that calculates accuracies.""" 30 | args = get_args() 31 | data_path = args.data_path 32 | crop_size = args.img_dim 33 | 34 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 35 | # Build dataloaders.
36 | val_data_path = os.path.join(data_path[0], "val") 37 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 38 | transform_val = transforms.Compose( 39 | [ 40 | transforms.Resize(crop_size), 41 | transforms.CenterCrop(crop_size), 42 | transforms.ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 47 | 48 | dataloader = build_data_loader( 49 | dataset, 50 | args.micro_batch_size, 51 | num_workers=args.num_workers, 52 | drop_last=(mpu.get_data_parallel_world_size() > 1), 53 | ) 54 | 55 | def metrics_func(model, epoch): 56 | print_rank_0("calculating metrics ...") 57 | correct, total = calculate_correct_answers(model, dataloader, epoch) 58 | percent = float(correct) * 100.0 / float(total) 59 | print_rank_0( 60 | " >> |epoch: {}| overall: correct / total = {} / {} = " 61 | "{:.4f} %".format(epoch, correct, total, percent) 62 | ) 63 | 64 | return metrics_func 65 | 66 | 67 | def calculate_correct_answers(model, dataloader, epoch): 68 | """Calculate correct over total answers""" 69 | 70 | model.eval() 71 | with torch.no_grad(): 72 | # For all the batches in the dataset. 73 | total = 0 74 | correct = 0 75 | for _, batch in enumerate(dataloader): 76 | # Run the model forward. 77 | images, labels = process_batch(batch) 78 | logits = model(images).contiguous().float() 79 | # Add output predictions. 80 | # Compute the correct answers. 81 | predicted = torch.argmax(logits, dim=-1) 82 | corrects = (predicted == labels).float() 83 | # Add to the counters. 84 | total += labels.size(0) 85 | correct += corrects.sum().item() 86 | model.train() 87 | 88 | # Reduce. 89 | unreduced = get_accelerator().LongTensor([correct, total]) 90 | torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) 91 | 92 | # Print on screen. 93 | correct_ans = unreduced[0].item() 94 | total_count = unreduced[1].item() 95 | return correct_ans, total_count 96 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. Zero results in "
43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? 
") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/test_megatron.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import re 4 | import subprocess 5 | 6 | 7 | @pytest.fixture(params=[1]) 8 | def moe_num_experts(request): 9 | return str(request.param) 10 | 11 | 12 | @pytest.fixture(params=[1]) 13 | def mp_size(request): 14 | return str(request.param) 15 | 16 | 17 | @pytest.fixture 18 | def params(moe_num_experts, mp_size): 19 | base_dir = os.getenv("MEGATRON_CKPT_DIR") 20 | assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" 21 | 22 | vocab_file = os.path.join(base_dir, "gpt2-vocab.json") 23 | merge_file = os.path.join(base_dir, "gpt2-merges.txt") 24 | ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") 25 | 26 | return [ 27 | "--micro-batch-size", "1", 28 | "--num-layers", "24", 29 | "--hidden-size", "1024", 30 | "--num-attention-heads", "16", 31 | "--max-position-embeddings", "1024", 32 | "--vocab-file", vocab_file, 33 | "--merge-file", merge_file, 34 | "--load", ckpt_path, 35 | "--seq-length", "1024", 36 | "--out-seq-length", "1024", 37 | "--tensor-model-parallel-size", mp_size, 38 | "--tokenizer-type", "GPT2BPETokenizer", 39 | "--num-experts", moe_num_experts, 40 | "--mlp-type", "standard", 41 | "--num-samples", "0", 42 | "--fp16", 43 | ] 44 | 45 | 46 | def test_moe_megatron(params, mp_size): 47 | output_re = r"===START OUTPUT===([\S\s]*)===END OUTPUT===" 48 | 49 | # Run the baseline 50 | baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params 51 | result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) 52 | baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 53 | 54 | # Run with DeepSpeed 55 | deepspeed_cmd = baseline_cmd + ["--ds-inference"] 56 | result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) 57 | deepspeed_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) 58 | 59 | assert ( 60 | baseline_output == deepspeed_output 61 | ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" 62 | 
-------------------------------------------------------------------------------- /tools/convert_checkpoint/deepspeed_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import torch 5 | import json 6 | 7 | from deepspeed_checkpoint import DeepSpeedCheckpoint 8 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments 9 | 10 | # the import was tested to work with this version 11 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider 12 | # copying that version here instead 13 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint 14 | from transformers import GPT2Config 15 | 16 | def main(): 17 | 18 | # this first part comes mainly from deepspeed_to_megatron.main 19 | args = parse_arguments() 20 | print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}') 21 | 22 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) 23 | iteration = ds_checkpoint.get_iteration() 24 | input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release) 25 | 26 | # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main 27 | # Spell out all parameters in case the defaults change. 28 | config = GPT2Config( 29 | vocab_size=50257, 30 | n_positions=1024, 31 | n_ctx=1024, 32 | n_embd=1024, 33 | n_layer=24, 34 | n_head=16, 35 | n_inner=4096, 36 | activation_function="gelu", # used to be "gelu_new" in earlier versions 37 | resid_pdrop=0.1, 38 | embd_pdrop=0.1, 39 | attn_pdrop=0.1, 40 | layer_norm_epsilon=1e-5, 41 | initializer_range=0.02, 42 | summary_type="cls_index", 43 | summary_use_proj=True, 44 | summary_activation=None, 45 | summary_proj_to_labels=True, 46 | summary_first_dropout=0.1, 47 | scale_attn_weights=True, 48 | gradient_checkpointing=False, 49 | use_cache=True, 50 | bos_token_id=50256, 51 | eos_token_id=50256, 52 | ) 53 | 54 | # Convert. 55 | print("Converting to HF Checkpoint") 56 | output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) 57 | 58 | basename = args.output_folder 59 | os.makedirs(basename, exist_ok=True) 60 | 61 | # Print the structure of converted state dict. 62 | #if args.print_checkpoint_structure: 63 | # recursive_print(None, output_state_dict) 64 | 65 | # Store the config to file. 66 | output_config_file = os.path.join(basename, "config.json") 67 | output_config = config.to_dict() 68 | output_config["architectures"] = ["GPT2LMHeadModel"] 69 | output_config["model_type"] = "gpt2" 70 | print(f'Saving config to "{output_config_file}"') 71 | with open(output_config_file, "w") as f: 72 | json.dump(output_config, f) 73 | 74 | # Store the state_dict to file. 
75 | output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") 76 | print(f'Saving checkpoint to "{output_checkpoint_file}"') 77 | torch.save(output_state_dict, output_checkpoint_file) 78 | 79 | print("Now add tokenizer files and upload to the hub") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} <checkpoint file>') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file) 34 | dump_data(sd) 35 | 36 | 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_deepspeed_checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from deepspeed_checkpoint import DeepSpeedCheckpoint 3 | 4 | def list_files(file_list, tag): 5 | print(f'Listing files: {tag}') 6 | for i, file in enumerate(file_list): 7 | print(f'{i+1}: {file}') 8 | 9 | def parse_arguments(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--folder', default=None, type=str, help='DeepSpeed Checkpoint folder') 12 | parser.add_argument('--target_tp', default=None, type=int, help='Target TP degree') 13 | parser.add_argument('--target_pp', default=None, type=int, help='Target PP degree') 14 | args = parser.parse_args() 15 | print(f'args = {args}') 16 | return args 17 | 18 | 19 | def show_input_files(ds_checkpoint): 20 | list_files(ds_checkpoint.file_list, 'all') 21 | list_files(ds_checkpoint.zero_files, 'zero') 22 | list_files(ds_checkpoint.layer_files, 'layer') 23 | list_files(ds_checkpoint.mp_rank_files, 'mp rank') 24 | 25 | def show_simple_state(ds_checkpoint): 26 | print(f'layer keys = {ds_checkpoint.layer_keys}') 27 | print(f'layer count = {ds_checkpoint.layer_count}') 28 | 29 | print(f'tp_degree_count = {ds_checkpoint.tp_degree}') 30 | print(f'pp_degree_count = {ds_checkpoint.pp_degree}') 31 | print(f'dp_degree_count = {ds_checkpoint.dp_degree}') 32 | 33 | def show_mappings(ds_checkpoint): 34 | ds_checkpoint.show_pp_tranformer_map() 35 | ds_checkpoint.show_transformer_file_map() 36 | ds_checkpoint.show_tp_embedding_map() 37 | ds_checkpoint.show_tp_final_norm_map() 38 | 39 | def show_state_summary(tag, sd): 40 | summary = {k:v.shape for k,v in sd.items()} 41 | print(f'{tag} = {summary}') 42 | 43 | def show_embedding_states(ds_checkpoint): 44 | for i in range(0, ds_checkpoint.tp_degree): 45 | sd = ds_checkpoint.get_embedding_state(i) 46 | show_state_summary(f'embedding[{i}]', sd) 47 | 48 | def 
show_final_norm_states(ds_checkpoint): 49 | for i in range(0, ds_checkpoint.tp_degree): 50 | sd = ds_checkpoint.get_final_norm_state(i) 51 | show_state_summary(f'final_norm[{i}]', sd) 52 | 53 | def show_transformer_states(ds_checkpoint): 54 | for i in range(0, ds_checkpoint.tp_degree): 55 | for j in range(0, ds_checkpoint.pp_degree): 56 | state_list = ds_checkpoint.get_transformer_state(tp_index=i, pp_index=j) 57 | print(f'tp_pp_rank[{i},{j}] = ') 58 | for k, sd in enumerate(state_list): 59 | show_state_summary(f' block[{k}]', sd) 60 | print("") 61 | 62 | 63 | def main(): 64 | print(f'Inspecting DeepSpeed Checkpoint') 65 | args = parse_arguments() 66 | 67 | ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, args.target_pp) 68 | ds_checkpoint.validate_files() 69 | 70 | show_input_files(ds_checkpoint) 71 | show_simple_state(ds_checkpoint) 72 | show_mappings(ds_checkpoint) 73 | show_embedding_states(ds_checkpoint) 74 | show_final_norm_states(ds_checkpoint) 75 | show_transformer_states(ds_checkpoint) 76 | checkpoint_args = ds_checkpoint.get_args() 77 | print(f'checkpoint args = {checkpoint_args}') 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 4 | os.path.pardir))) 5 | 6 | from megatron import print_rank_0 7 | from megatron.indexer import IndexBuilder 8 | from megatron.initialize import initialize_megatron 9 | 10 | 11 | def main(): 12 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 13 | - Include all args needed for initial model specification 14 | 15 | Other key args: 16 | --block-data-path: path to write to 17 | --ict-load or --realm-load: path to checkpoint with which to embed 18 | --data-path and --titles-data-path: paths for dataset 19 | --indexer-log-interval: reporting interval 20 | --indexer-batch-size: size specific for indexer jobs 21 | 22 | Check README.md for example script 23 | """ 24 | 25 | initialize_megatron(extra_args_provider=None, 26 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 27 | index_builder = IndexBuilder() 28 | index_builder.build_and_save_index() 29 | print_rank_0("Build and save indices: done!") 30 | 31 | if __name__ == "__main__": 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, 
sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command.split()) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare the training dataset used to train the model. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py <path to the deduplicated URLs> <output file for clean urls> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data file> 28 | ``` 29 | Additional cleanup (e.g. removing documents with fewer than 512 characters, or dataset-specific cleaning like for the stories and realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <list of input data files> --output <output file of possible duplicates> 33 | ``` 34 | 3. Based on the similarity measure defined inside the function `is_similar` (default threshold: 0.9), group urls that are similar. For each group, we keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <possible duplicates file> <output file of grouped duplicate urls> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <grouped duplicate urls file> <input data file> <output deduplicated data file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <deduplicated data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <task files> --dedup-dataset <training dataset to deduplicate> --output <deduped training dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, as well as any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path to the lambada test set via `--lambada-path <path to lambada test set>`.
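For example, a concrete invocation (the file names here are only illustrative) could look like:

```
python filter_ngrams.py --tasks lambada --dedup-dataset train_data.json --output train_data_deduped.json --lambada-path <path to lambada test set>
```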
58 | 59 | Several other features (e.g. saving and loading the dictionary) have been added; see `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. Users can add a prefix 23 | to the ids. 24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the file. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | --------------------------------------------------------------------------------