├── .github └── workflows │ ├── ci.md │ └── main.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── examples ├── create_embeddings.sh ├── curriculum_learning │ ├── README.md │ ├── ds_config_cl.json │ └── pretrain_gpt_cl.sh ├── evaluate_ict_zeroshot_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── generate_text.sh ├── merge_mp_bert.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_gpt_multilingual.sh ├── pretrain_gpt_single_node.sh ├── pretrain_gpt_tiny.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_evalharness.sh ├── run_evalharness_deepspeed.md ├── run_evalharness_deepspeed.slurm └── run_evalharness_tr11-176b-ml.slurm ├── finetune_t0_non_causal_decoder.py ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── decoder_packed_mtf_dataset.py │ ├── distdata.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── mlm_dataset.py │ ├── mtf_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── logging.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── glu_activations.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── positional_embeddings.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── package_info.py ├── schedules.py ├── testing_utils.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_prefix_lm.py ├── pretrain_t5.py ├── pretrain_vit.py ├── pyproject.toml ├── requirements.txt 
├── run.sh ├── run_bf16.sh ├── run_fp16.sh ├── run_universal_bf16.sh ├── scripts ├── README.md ├── bloom-inference-scripts │ └── README.md ├── bloom-inference-server │ └── README.md └── test_multiple_dataset_sampling │ ├── create_dummy_dataset.py │ ├── preprocess_data.py │ ├── test_sampling.py │ └── test_sampling.sh ├── setup.cfg ├── setup.py ├── start_fast.md ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── orqa │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ └── natural_questions │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── README.md ├── conftest.py ├── data │ └── gpt2 │ │ ├── README.md │ │ ├── ag_news_prompt_inputs_document.bin │ │ ├── ag_news_prompt_inputs_document.idx │ │ ├── ag_news_prompt_targets_document.bin │ │ ├── ag_news_prompt_targets_document.idx │ │ ├── generate_ag_news_mtf_dataset.sh │ │ ├── gpt2-tiny-merges.txt │ │ ├── gpt2-tiny-vocab.json │ │ ├── meg-gpt2-openwebtext_text_document.bin │ │ ├── meg-gpt2-openwebtext_text_document.idx │ │ └── openwebtext-1000.jsonl ├── ds_config.json ├── ds_config_bf16.json ├── ds_config_cl.json ├── ds_config_inference.json ├── test_activations.py ├── test_basic.py ├── test_checkpoints.py ├── test_dataloaders.py ├── test_model.py ├── test_preprocessing.py ├── test_tensor_parallel.py ├── test_training.py └── tools │ ├── README.md │ ├── openwebtext-to-jsonl.py │ └── shrink-tokenizer.py └── tools ├── README.md ├── convert_checkpoint ├── README.md ├── deepspeed_to_deepspeed.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── ds_to_universal.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── create_doc_index.py ├── generate_samples_gpt.py ├── linter.py ├── logs ├── rescale-logs.py ├── tb-rename-events.py └── tb-rescale-scalars.py ├── merge_mp_partitions.py ├── merge_preprocessed_data.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_dist.py ├── preprocess_data_many_cores.py ├── sample_idxs_to_text.py └── tb ├── tb-remove-events-by-group.py ├── tb-remove-events-by-tag.py └── tb-rename-events.py /.gitignore: -------------------------------------------------------------------------------- 1 | # tests 2 | # megatron autogenerated indices 3 | tests/data/*/*npy 4 | tests/tools/openwebtext-1000.jsonl 5 | tmp/ 6 | 7 | # macOS 8 | .DS_Store 9 | 10 | # Byte-compiled / optimized / DLL files 11 | */__pycache__/ 12 | *.py[cod] 13 | *.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | Pipfile 96 | Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Intellij project settings 115 | .idea/ 116 | .iml 117 | 118 | # VSCode 119 | .vscode/ 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application 142 | media 143 | staticfiles 144 | /tags 145 | 146 | # tmp files 147 | *.swp 148 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @bigscience-workshop/megatron-deepspeed-codeowners 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test style 2 | 3 | check_dirs := tests tools/convert_checkpoint 4 | 5 | help: ## this help 6 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 7 | 8 | test: ## run tests 9 | pytest tests 10 | 11 | style: ## checks for code style and
applies formatting 12 | black $(check_dirs) 13 | isort $(check_dirs) 14 | -------------------------------------------------------------------------------- /examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_config_cl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": 8, 30 | "max_difficulty": 1024, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": 60000, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/curriculum_learning/pretrain_gpt_cl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is a dummy train script to show how to use curriculum 4 | # learning; some parameters are not for actual GPT pretraining.
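# (Editor's note) With the fixed_linear schedule configured in ds_config_cl.json,
# the effective sequence length should grow roughly linearly from min_difficulty (8)
# to max_difficulty (1024) over total_curriculum_step (60000) steps, rounded to a
# multiple of difficulty_step (8); see the DeepSpeed curriculum-learning docs for
# the exact rounding rule.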
5 | 6 | TARGET_GLOBAL_BATCH_SIZE=512 7 | TRAIN_SAMPLES=146_484_375 8 | LR=1.0e-4 9 | MIN_LR=1.0e-5 10 | LR_DECAY_SAMPLES=126_953_125 11 | LR_WARMUP_SAMPLES=183_105 12 | SEQLEN=1024 13 | 14 | ############################################################ 15 | # New configs for curriculum learning, see README.md 16 | TRAIN_TOKENS=10_000_000_000 17 | LR_DECAY_TOKENS=$(( ${LR_DECAY_SAMPLES//_/} * $SEQLEN )) # strip '_' digit separators: bash arithmetic rejects them 18 | ############################################################ 19 | 20 | LOG_INTERVAL=100 21 | EVAL_ITERS=10 22 | EVAL_INTERVAL=100 23 | SAVE_INTERVAL=1000 24 | 25 | VOCAB_PATH=/data/Megatron-LM/data/gpt2-vocab.json 26 | MERGE_PATH=/data/Megatron-LM/data/gpt2-merges.txt 27 | DATA_PATH=/data/Megatron-LM/data/indexed_datasets/megatron 28 | 29 | MICRO_BATCH_SIZE=1 30 | MP_SIZE=1 31 | PP_SIZE=1 32 | 33 | NUM_GPUS=128 34 | echo ${NUM_GPUS} 35 | if [[ $PP_SIZE -gt 0 ]]; then 36 | DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) 37 | else 38 | DP_SIZE=$(( ${NUM_GPUS} / ${MP_SIZE} )) 39 | fi 40 | GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${MICRO_BATCH_SIZE} * ${DP_SIZE}) )) 41 | 42 | NAME="gpt-117M-pp${PP_SIZE}-mp${MP_SIZE}-bsz${TARGET_GLOBAL_BATCH_SIZE}-mbsz${MICRO_BATCH_SIZE}-cl" 43 | current_time=$(date "+%Y.%m.%d-%H.%M.%S") 44 | host="${HOSTNAME}" 45 | TENSORBOARD_DIR="tensorboard/${NAME}_${host}_${current_time}" 46 | mkdir -p ${TENSORBOARD_DIR} 47 | CHECKPOINT_PATH="checkpoints/${NAME}" 48 | 49 | megatron_options=" \ 50 | --data-path ${DATA_PATH} \ 51 | --vocab-file ${VOCAB_PATH} \ 52 | --merge-file ${MERGE_PATH} \ 53 | --data-impl mmap \ 54 | --override-lr-scheduler \ 55 | --adam-beta1 0.9 \ 56 | --adam-beta2 0.95 \ 57 | --tensor-model-parallel-size ${MP_SIZE} \ 58 | --init-method-std 0.014 \ 59 | --lr-decay-tokens ${LR_DECAY_TOKENS} \ 60 | --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ 61 | --micro-batch-size ${MICRO_BATCH_SIZE} \ 62 | --global-batch-size ${TARGET_GLOBAL_BATCH_SIZE} \ 63 | --num-layers 12 \ 64 | --hidden-size 768 \ 65 | --num-attention-heads 16 \ 66 | --seq-length ${SEQLEN} \ 67 | --max-position-embeddings ${SEQLEN} \ 68 | --train-samples ${TRAIN_SAMPLES} \ 69 | --train-tokens ${TRAIN_TOKENS} \ 70 | --lr ${LR} \ 71 | --min-lr ${MIN_LR} \ 72 | --lr-decay-style cosine \ 73 | --split 98,2,0 \ 74 | --log-interval ${LOG_INTERVAL} \ 75 | --eval-interval ${EVAL_INTERVAL} \ 76 | --eval-iters ${EVAL_ITERS} \ 77 | --save-interval ${SAVE_INTERVAL} \ 78 | --weight-decay 0.1 \ 79 | --clip-grad 1.0 \ 80 | --hysteresis 2 \ 81 | --num-workers 0 \ 82 | --checkpoint-activations \ 83 | --fp16 \ 84 | --load ${CHECKPOINT_PATH} \ 85 | --save ${CHECKPOINT_PATH} \ 86 | --tensorboard-queue-size 1 \ 87 | --log-timers-to-tensorboard \ 88 | --log-batch-size-to-tensorboard \ 89 | --log-validation-ppl-to-tensorboard \ 90 | --tensorboard-dir ${TENSORBOARD_DIR}" 91 | 92 | config_json="ds_config_cl.json" 93 | 94 | deepspeed_options=" \ 95 | --deepspeed \ 96 | --deepspeed_config ${config_json} \ 97 | --pipeline-model-parallel-size ${PP_SIZE} \ 98 | --partition-activations" 99 | 100 | run_cmd="deepspeed ../../pretrain_gpt.py ${megatron_options} ${deepspeed_options} &>> ${NAME}.log" 101 | echo ${run_cmd} 102 | eval ${run_cmd} 103 | set +x 104 | -------------------------------------------------------------------------------- /examples/evaluate_ict_zeroshot_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model 5 | 6 | # Datasets can be downloaded from the
following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task ICT-ZEROSHOT-NQ \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --checkpoint-activations \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --num-workers 2 \ 33 | --faiss-use-gpu \ 34 | --retriever-report-topk-accuracies 1 5 20 100 \ 35 | --fp16 36 | 37 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | 
-------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | 7 | python tools/generate_samples_gpt.py \ 8 | --tensor-model-parallel-size 1 \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --load $CHECKPOINT_PATH \ 12 | --num-attention-heads 16 \ 13 | --max-position-embeddings 1024 \ 14 | --tokenizer-type GPT2BPETokenizer \ 15 | --fp16 \ 16 | --batch-size 2 \ 17 | --seq-length 1024 \ 18 | --out-seq-length 1024 \ 19 | --temperature 1.0 \ 20 | --vocab-file $VOCAB_FILE \ 21 | --merge-file $MERGE_FILE \ 22 | --genfile unconditional_samples.json \ 23 | --num-samples 2 \ 24 | --top_p 0.9 \ 25 | --recompute 26 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=<Specify path and file prefix>_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 
--num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path $DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=<Specify path and file prefix>_text_sentence 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_bert.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 32 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --train-iters 1000000 \ 26 | --save $CHECKPOINT_PATH \ 27 | --load $CHECKPOINT_PATH \ 28 | --data-path $DATA_PATH \ 29 | --vocab-file bert-vocab.txt \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.0001 \ 34 | --lr-decay-style linear \ 35 | --min-lr 1.0e-5 \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --lr-warmup-fraction .01 \ 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=<Specify path and file prefix>_text_sentence 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_bert.py \ 19 | --tensor-model-parallel-size 2 \ 20 | --pipeline-model-parallel-size 2 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --micro-batch-size 2 \ 25 | --global-batch-size 16 \ 26 | --max-position-embeddings 512 \ 27 | --train-iters 1000000 \ 28 | --save $CHECKPOINT_PATH \ 29 | --load $CHECKPOINT_PATH \ 30 | --data-path $DATA_PATH \ 31 | --vocab-file $VOCAB_FILE \ 32 | --data-impl mmap \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr 0.0001 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --lr-decay-iters 990000 \ 39 | --weight-decay 1e-2 \ 40 | --clip-grad 1.0 \ 41 | --lr-warmup-fraction .01 \ 42 | --log-interval 100 \ 43 | --save-interval 10000 \ 
44 | --eval-interval 1000 \ 45 | --eval-iters 10 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=<Specify path and file prefix>_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | deepspeed --num_gpus 1 pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="<PATH TO THE FIRST DATASET>" 13 | DATASET_2="<PATH TO THE SECOND DATASET>" 14 | DATASET_3="<PATH TO THE THIRD DATASET>" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file <path to gpt2-vocab.json> \ 40 | --merge-file <path to gpt2-merges.txt> \ 41 | --save-interval 1000 \ 42 | --save <path to checkpoints directory> \ 43 | --load <path to checkpoints directory> \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir <path to tensorboard directory> \ 51 | --fp16 \ 52 | --checkpoint-activations " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "<directories to mount>" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=<Specify path and file prefix>_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --micro-batch-size 8 \ 24 | --global-batch-size 64 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=<Specify path and file prefix>_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --tensor-model-parallel-size 2 \ 21 | --pipeline-model-parallel-size 2 \ 22 | --num-layers 24 \ 23 | --hidden-size 1024 \ 24 | --num-attention-heads 16 \ 25 | --micro-batch-size 4 \ 26 | --global-batch-size 16 \ 27 | --seq-length 1024 \ 28 | --max-position-embeddings 1024 \ 29 | --train-iters 500000 \ 30 | --lr-decay-iters 320000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file gpt2-vocab.json \ 35 | --merge-file gpt2-merges.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --distributed-backend nccl \ 39 | --lr 0.00015 \ 40 | --lr-decay-style cosine \ 41 | --min-lr 1.0e-5 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --lr-warmup-fraction .01 \ 45 | --checkpoint-activations \ 46 | --log-interval 100 \ 47 | --save-interval 10000 \ 48 | --eval-interval 1000 \ 49 | --eval-iters 10 \ 50 | --fp16 51 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_multilingual.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # paths to multilingual preprocessed datasets 9 | DATA_EN=<Specify path and file prefix>_text_document 10 | DATA_AR=<Specify path and file prefix>_text_document 11 | DATA_KR=<Specify path and file prefix>_text_document 12 | DATA_JP=<Specify path and file prefix>_text_document 13 | 14 | CHECKPOINT_PATH= 15 | 16 | 17 | deepspeed --num_gpus 1 pretrain_gpt.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 8 \ 23 | --seq-length 1024 \ 24 | --max-position-embeddings 1024 \ 25 | --train-iters 500000 \ 26 | --lr-decay-iters 320000 \ 27 | --save $CHECKPOINT_PATH \ 28 | --load $CHECKPOINT_PATH \ 29 | --train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_EN 1 0:0.6 $DATA_AR 1 0:0.6 $DATA_KR 1 0:0.6 $DATA_JP" \ 30 | --valid-weighted-split-paths \ 31 | "VALID_EN: 1 0.6:0.8 $DATA_EN" \ 32 | "VALID_AR: 1 0.6:0.8 $DATA_AR" \ 33 | "VALID_JP: 1 0.6:0.8 $DATA_JP" \ 34 | "VALID_KR: 1 0.6:0.8 $DATA_KR" \ 35 | "VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_EN, 1 0.6:0.8 $DATA_AR, 1 0.6:0.8 $DATA_JP, 1 0.6:0.8 $DATA_KR" \ 36 | --test-weighted-split-paths \ 37 | "TEST_EN: 1 0.8:1 $DATA_EN" \ 38 | "TEST_AR: 1 0.8:1 $DATA_AR" \ 39 | "TEST_JP: 1 0.8:1 $DATA_JP" \ 40 | "TEST_KR: 1 0.8:1 $DATA_KR" \ 41 | "TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_EN, 1 0.8:1 $DATA_AR, 1 0.8:1 $DATA_JP, 1 0.8:1 $DATA_KR" \ 42 | --vocab-file gpt2-vocab.json \ 43 | --merge-file gpt2-merges.txt \ 44 | --data-impl mmap \ 45 | --split 949,50,1 \ 46 | --distributed-backend nccl \ 47 | --lr 0.00015 \ 48 | --min-lr 1.0e-5 \ 49 | --lr-decay-style cosine \ 50 | --weight-decay 1e-2 \ 51 | --clip-grad 1.0 \ 52 | --lr-warmup-fraction .01 \ 53 | --checkpoint-activations \ 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 \ 58 | --fp16 59 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted to use deepspeed on a single node 4 | # 5 | # Multi-node will require either a `hostfile` or switching to `torch.distributed.launch` 6 | 7 | # adjust to the number of GPUs to use 8 | N_GPUS=1 9 | 10 | CHECKPOINT_PATH=checkpoints/gpt2 11 | VOCAB_FILE=data/gpt2-vocab.json 12 | MERGE_FILE=data/gpt2-merges.txt 13 | DATA_PATH=data/meg-gpt2_text_document 14 | 15 | GPT_ARGS=" \ 16 | --num-layers 24 \ 17 | --hidden-size 1024 \ 18 | --num-attention-heads 16 \ 19 | --seq-length 1024 \ 20 | --max-position-embeddings 1024 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 8 \ 23 | --lr-decay-iters 320000 \ 24 | --lr 0.00015 \ 25 | --min-lr 1.0e-5 \ 26 | --lr-decay-style cosine \ 27 | --train-iters 5000 \ 28 | --vocab-file $VOCAB_FILE \ 29 | --merge-file $MERGE_FILE \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --fp16 \ 37 | " 38 | 39 | OUTPUT_ARGS=" \ 40 | --log-interval 10 \ 41 | --save-interval 500 \ 42 | --eval-interval 100 \ 43 | --eval-iters 10 \ 44 | --checkpoint-activations \ 45 | " 46 | 47 | DATA_ARGS=" \ 48 | --save $CHECKPOINT_PATH \ 49 | --load $CHECKPOINT_PATH \ 50 | --data-path $DATA_PATH \ 51 | " 52 | 53 | ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS" 54 | 55 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 56 | 57 | CMD="$LAUNCHER pretrain_gpt.py $ALL_ARGS" 58 | 59 | echo $CMD 60 | 61 | $CMD 62 | 
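(Editor's note: the --*-weighted-split-paths arguments in pretrain_gpt_multilingual.sh above pack one split per quoted string, as "NAME: weight start:end path" groups separated by commas for the valid/test splits. The sketch below shows one way such a string can be decoded; it is illustrative only, under the assumption of that grammar, and parse_weighted_split is not a function from this repo.)

def parse_weighted_split(spec: str):
    """Decode 'NAME: w s:e path, w s:e path, ...' into (name, groups)."""
    name, _, groups = spec.partition(":")  # the first colon ends the split name
    datasets = []
    for group in groups.split(","):
        weight, span, path = group.split()  # e.g. '1 0.6:0.8 /data/en_text_document'
        start, end = (float(x) for x in span.split(":"))
        datasets.append((float(weight), (start, end), path))
    return name.strip(), datasets

# Example: the VALID_EN group above selects the 60%-80% slice of one dataset.
# parse_weighted_split("VALID_EN: 1 0.6:0.8 /data/en_text_document")
# -> ('VALID_EN', [(1.0, (0.6, 0.8), '/data/en_text_document')])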
-------------------------------------------------------------------------------- /examples/pretrain_gpt_tiny.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=GPT2/c4_en_partial_gpt2_text_document 9 | CHECKPOINT_PATH=GPT2 10 | 11 | 12 | deepspeed --num_gpus 1 pretrain_gpt.py \ 13 | --num-layers 2 \ 14 | --hidden-size 128 \ 15 | --num-attention-heads 4 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 256 \ 19 | --max-position-embeddings 256 \ 20 | --train-iters 10000 \ 21 | --lr-decay-iters 5000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --tokenizer-type PretrainedFromHF \ 26 | --tokenizer-name-or-path t5-small \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 \ 42 | --tensorboard-dir GPT2 43 | 44 | # --vocab-file GPT2/gpt2-vocab.json \ 45 | # --merge-file GPT2/gpt2-merges.txt \ 46 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | 
--data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_t5.py \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_t5.py \ 18 | --tensor-model-parallel-size 2 \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --seq-length 512 \ 29 | --max-position-embeddings 512 \ 30 | --train-iters 1000000 \ 31 | --lr-decay-iters 1000000 \ 32 | --save $CHECKPOINT_PATH \ 33 | --load $CHECKPOINT_PATH \ 34 | --data-path $DATA_PATH \ 35 | --vocab-file t5-vocab.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --lr 0.0001 \ 39 | --min-lr 0.00001 \ 40 | --lr-decay-style linear \ 41 | --lr-warmup-fraction .01 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/run_evalharness.sh: 
-------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ 2 | 3 | PP_SIZE=1 4 | TP_SIZE=1 5 | VOCAB_FILE=gpt2-vocab.json 6 | MERGE_FILE=gpt2-merges.txt 7 | 8 | export HF_DATASETS_OFFLINE=1 9 | 10 | #dummy arguments to make megatron happy. 11 | MEGATRON_REQUIRED_ARGS="\ 12 | --num-layers -1\ 13 | --hidden-size -1\ 14 | --num-attention-heads -1\ 15 | --seq-length -1 \ 16 | --max-position-embeddings -1 17 | " 18 | 19 | CMD="./tasks/eval_harness/evaluate.py \ 20 | --load $CHECKPOINT_PATH\ 21 | --tensor-model-parallel-size $TP_SIZE \ 22 | --pipeline-model-parallel-size $PP_SIZE\ 23 | --vocab-file $VOCAB_FILE\ 24 | --merge-file $MERGE_FILE\ 25 | --micro-batch-size 64\ 26 | --adaptive_seq_len\ 27 | --eval_fp32\ 28 | --task_list hellaswag,mrpc,piqa\ 29 | $MEGATRON_REQUIRED_ARGS\ 30 | " 31 | 32 | N_GPUS=1 33 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 34 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/run_evalharness_deepspeed.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval-harness-deepspeed 3 | #SBATCH --constraint=v100-16g 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 6 | #SBATCH --cpus-per-task=40 # number of cores per tasks 7 | #SBATCH --hint=nomultithread # we get physical cores not logical 8 | #SBATCH --gres=gpu:1 # number of gpus 9 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 10 | #SBATCH --output=%x-%j.out # output file name 11 | #SBATCH --account=six@gpu 12 | 13 | 14 | set -x -e 15 | 16 | source $six_ALL_CCFRWORK/start-prod 17 | 18 | echo "START TIME: $(date)" 19 | 20 | # a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file. 21 | VARIANT="tr9c-1B3-swiglu" 22 | 23 | CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 24 | MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed 25 | 26 | # you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models 27 | # but comment these out if you're running on a node with Internet access 28 | export HF_DATASETS_OFFLINE=1 29 | export TRANSFORMERS_OFFLINE=1 30 | 31 | cd $MEGATRON_DEEPSPEED_REPO 32 | 33 | # eval topology 34 | PP_SIZE=1 35 | TP_SIZE=1 36 | 37 | VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json 38 | MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt 39 | SEQ_LEN=2048 40 | 41 | # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS 42 | # make as big as it can fit into gpu w/o OOM, but not too close to 100% 43 | 44 | EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model 45 | #EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model 46 | 47 | 48 | #dummy arguments to make megatron happy. 
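# (Editor's note, assumption: the eval-harness entry point rebuilds the real model
# geometry from the hyperparameters stored in the checkpoint, so the -1 values
# below are placeholders that only exist to satisfy megatron's required CLI arguments.)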
49 | MEGATRON_REQUIRED_ARGS=" \ 50 | --num-layers -1 \ 51 | --hidden-size -1 \ 52 | --num-attention-heads -1 \ 53 | --seq-length -1 \ 54 | --max-position-embeddings -1 55 | " 56 | 57 | 58 | ZERO_STAGE=0 59 | 60 | config_json="./ds_config.json" 61 | cat <<EOT > $config_json 62 | { 63 | "train_micro_batch_size_per_gpu": 1, 64 | "train_batch_size": 1, 65 | "zero_optimization": { "stage": $ZERO_STAGE }, 66 | "fp16": { "enabled": true }, 67 | "steps_per_print": 2000, 68 | "wall_clock_breakdown": false 69 | } 70 | EOT 71 | 72 | CMD="./tasks/eval_harness/evaluate.py \ 73 | --load $CHECKPOINT_PATH \ 74 | --results_path $VARIANT-results.json \ 75 | --tensor-model-parallel-size $TP_SIZE \ 76 | --pipeline-model-parallel-size $PP_SIZE \ 77 | --vocab-file $VOCAB_FILE \ 78 | --merge-file $MERGE_FILE \ 79 | --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ 80 | --no-load-optim \ 81 | --no-load-rng \ 82 | --inference \ 83 | --deepspeed \ 84 | --deepspeed_config ds_config.json \ 85 | --seq-length $SEQ_LEN \ 86 | --adaptive_seq_len \ 87 | --eval_fp32 \ 88 | --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ 89 | $MEGATRON_REQUIRED_ARGS \ 90 | " 91 | 92 | N_GPUS=1 93 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 94 | echo $LAUNCHER $CMD 95 | 96 | export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO 97 | 98 | $LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log 99 | -------------------------------------------------------------------------------- /examples/run_evalharness_tr11-176b-ml.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=run_evalharness-tr11-176b-ml 3 | #SBATCH --partition=gpu_p5 4 | #SBATCH --constraint=a100 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 7 | #SBATCH --cpus-per-task=64 # number of cores per tasks 8 | #SBATCH --hint=nomultithread # we get physical cores not logical 9 | #SBATCH --gres=gpu:8 # number of gpus 10 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 11 | #SBATCH --output=%x-%j.out # output file name 12 | #SBATCH --account=six@a100 13 | 14 | 15 | set -x -e 16 | 17 | source $six_ALL_CCFRWORK/start-py38-pt111 18 | 19 | echo "START TIME: $(date)" 20 | 21 | # a unique identifier for the current eval, ideally corresponding to the model name 22 | VARIANT="tr11-176b-ml" 23 | 24 | 25 | CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000 26 | MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed 27 | export HF_DATASETS_OFFLINE=1 28 | export TRANSFORMERS_OFFLINE=1 29 | 30 | export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models 31 | export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets 32 | export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules 33 | export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics 34 | 35 | cd $MEGATRON_DEEPSPEED_REPO 36 | 37 | TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles 38 | 39 | PP_SIZE=8 40 | TP_SIZE=1 41 | SEQ_LEN=2048 42 | 43 | # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS 44 | # make as big as it can fit into gpu w/o OOM, but not too close to 100% 45 | EVAL_MICRO_BATCH_SIZE=1 46 | 47 | #dummy arguments to make megatron happy. 
48 | MEGATRON_REQUIRED_ARGS=" \ 49 | --num-layers -1 \ 50 | --hidden-size -1 \ 51 | --num-attention-heads -1 \ 52 | --seq-length -1 \ 53 | --max-position-embeddings -1 \ 54 | " 55 | 56 | 57 | ZERO_STAGE=0 58 | 59 | config_json="./ds_config.json" 60 | 61 | # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() 62 | cat <<EOT > $config_json 63 | { 64 | "train_micro_batch_size_per_gpu": 1, 65 | "train_batch_size": 1, 66 | "gradient_clipping": 1.0, 67 | "zero_optimization": { 68 | "stage": $ZERO_STAGE 69 | }, 70 | "bf16": { 71 | "enabled": true 72 | }, 73 | "steps_per_print": 2000, 74 | "wall_clock_breakdown": false 75 | } 76 | EOT 77 | 78 | 79 | CMD="./tasks/eval_harness/evaluate.py \ 80 | --load $CHECKPOINT_PATH \ 81 | --results_path $VARIANT-results.json \ 82 | --tensor-model-parallel-size $TP_SIZE \ 83 | --pipeline-model-parallel-size $PP_SIZE \ 84 | --tokenizer-type PretrainedFromHF \ 85 | --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ 86 | --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ 87 | --no-load-optim \ 88 | --no-load-rng \ 89 | --bf16 \ 90 | --inference \ 91 | --seq-length $SEQ_LEN \ 92 | --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ 93 | --deepspeed \ 94 | --deepspeed_config ds_config.json \ 95 | --bootstrap_iters 2 \ 96 | --intermed_results \ 97 | --adaptive_seq_len \ 98 | --micro_bs_multiplier 4 \ 99 | $MEGATRON_REQUIRED_ARGS \ 100 | " 101 | 102 | GPUS_PER_NODE=8 103 | NNODES=$SLURM_NNODES 104 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 105 | MASTER_PORT=6000 106 | export LAUNCHER="python -u -m torch.distributed.run \ 107 | --nproc_per_node $GPUS_PER_NODE \ 108 | --nnodes $NNODES \ 109 | --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ 110 | --rdzv_backend c10d \ 111 | --max_restarts 0 \ 112 | --tee 3 \ 113 | " 114 | 115 | export CUDA_LAUNCH_BLOCKING=1 116 | 117 | echo $LAUNCHER $CMD 118 | 119 | export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO 120 | 121 | $LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log 122 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_current_global_batch_size 30 | from .global_vars import get_num_microbatches 31 | from .global_vars import update_num_microbatches 32 | from .global_vars import get_tokenizer 33 | from .global_vars import get_tensorboard_writer 34 | from .global_vars import get_adlr_autoresume 35 | from .global_vars import get_timers 36 | from .initialize import initialize_megatron 37 | 38 | def print_rank_0(message): 39 | """If distributed is initialized, print only on rank 0.""" 40 | if torch.distributed.is_initialized(): 41 | if torch.distributed.get_rank() == 0: 42 | print(message, flush=True) 43 | else: 44 | print(message, flush=True) 45 | 46 | def is_last_rank(): 47 | return torch.distributed.get_rank() == ( 48 | torch.distributed.get_world_size() - 1) 49 | 50 | def print_rank_last(message): 51 | """If distributed is initialized, print only on last rank.""" 52 | if torch.distributed.is_initialized(): 53 | if is_last_rank(): 54 | print(message, flush=True) 55 | else: 56 | print(message, flush=True) 57 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | 2 | PYTHON3CONFIG := $(shell command -v python3-config 2> /dev/null) 3 | 4 | ifndef PYTHON3CONFIG 5 | $(error "python3-config is not available. Please install it. It may be in a python-dev or another package") 6 | endif 7 | 8 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 9 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 10 | LIBNAME = helpers 11 | LIBEXT = $(shell python3-config --extension-suffix) 12 | 13 | default: $(LIBNAME)$(LIBEXT) 14 | 15 | %$(LIBEXT): %.cpp 16 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 17 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indecies. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/mtf_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Multitask Finetune style dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset 25 | 26 | class MTFDataset(torch.utils.data.Dataset): 27 | 28 | def __init__( 29 | self, 30 | name, 31 | data_prefix, 32 | data_impl, 33 | skip_warmup, 34 | documents, 35 | ): 36 | # Params to store. 37 | self.name = name 38 | 39 | # Dataset. 
40 | self.input_indexed_dataset = get_indexed_dataset(data_prefix, is_input=True, data_impl=data_impl, skip_warmup=skip_warmup) 41 | self.target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) 42 | 43 | # Checks 44 | assert np.min(documents) >= 0 45 | assert np.max(documents) < self.input_indexed_dataset.sizes.shape[0] 46 | assert np.max(documents) < self.target_indexed_dataset.sizes.shape[0] 47 | assert self.input_indexed_dataset.sizes.shape[0] == self.target_indexed_dataset.sizes.shape[0] 48 | 49 | def __len__(self): 50 | return len(self.input_indexed_dataset) 51 | 52 | def __getitem__(self, idx): 53 | input_tokens = self.input_indexed_dataset.get(idx) 54 | target_tokens = self.target_indexed_dataset.get(idx) 55 | 56 | assert len(input_tokens) > 0 57 | assert len(target_tokens) > 0 58 | 59 | return { 60 | 'input_tokens': input_tokens, 61 | 'target_tokens': target_tokens, 62 | } 63 | 64 | def size(self, index): 65 | return { 66 | 'input_tokens': self.input_indexed_dataset.size(index), 67 | 'target_tokens': self.target_indexed_dataset.size(index), 68 | } 69 | 70 | def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool): 71 | if is_input: 72 | field = "inputs" 73 | else: 74 | field = "targets" 75 | 76 | return get_indexed_dataset_(f"{data_prefix}_{field}_document", data_impl, skip_warmup) 77 | 78 | def get_indexed_dataset_(path, data_impl, skip_warmup): 79 | """Build indexed dataset.""" 80 | print_rank_0(' > building dataset index ...') 81 | start_time = time.time() 82 | indexed_dataset = make_indexed_dataset(path, 83 | data_impl, 84 | skip_warmup) 85 | print_rank_0(' > finished creating indexed dataset in {:4f} ' 86 | 'seconds'.format(time.time() - start_time)) 87 | print_rank_0(' number of documents: {}'.format( 88 | indexed_dataset.sizes.shape[0])) 89 | 90 | return indexed_dataset 91 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
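# Note: build_train_valid_datasets() below relies on torchvision's ImageFolder,
# so data_path[0] must contain "train/" and "val/" directories with one
# subdirectory per class, e.g. (illustrative layout):
#
#     <data_path>/train/class_a/img0.jpeg
#     <data_path>/val/class_a/img1.jpeg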
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 # Overrides `attention_mask` to be a lower triangular matrix 29 | prefix = 3 30 | custom = 4 # Forces one to pass an `attention_mask` that's 1 if we need to mask. Tensor that can be broadcast to [micro_batch_size, n_head, seq_length, seq_length] 31 | 32 | class PositionEmbeddingType(enum.Enum): 33 | rotary = 1 34 | absolute = 2 35 | alibi = 3 36 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """For backward compatibility, we need the class definitions to deserialize."""
17 | 
18 | class LossScaler:
19 |     def __init__(self, scale=1):
20 |         self.cur_scale = scale
21 | 
22 | class DynamicLossScaler:
23 |     def __init__(self,
24 |                  init_scale=2**32,
25 |                  scale_factor=2.,
26 |                  scale_window=1000,
27 |                  min_scale=1,
28 |                  delayed_shift=1,
29 |                  consecutive_hysteresis=False):
30 |         self.cur_scale = init_scale
31 |         self.cur_iter = 0
32 |         self.last_overflow_iter = -1
33 |         self.scale_factor = scale_factor
34 |         self.scale_window = scale_window
35 |         self.min_scale = min_scale
36 |         self.delayed_shift = delayed_shift
37 |         self.cur_hysteresis = delayed_shift
38 |         self.consecutive_hysteresis = consecutive_hysteresis
39 | 
40 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /* This code is copied from NVIDIA apex:
18 |  *     https://github.com/NVIDIA/apex
19 |  *     with minor changes. */
20 | 
21 | 
22 | 
23 | #ifndef TORCH_CHECK
24 | #define TORCH_CHECK AT_CHECK
25 | #endif
26 | 
27 | #ifdef VERSION_GE_1_3
28 | #define DATA_PTR data_ptr
29 | #else
30 | #define DATA_PTR data
31 | #endif
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     torch::Tensor const& mask,
28 |     float scale_factor);
29 | 
30 | torch::Tensor bwd_cuda(
31 |     torch::Tensor const& output_grads,
32 |     torch::Tensor const& softmax_results,
33 |     float scale_factor);
34 | 
35 | int get_batch_per_block_cuda(
36 |     int query_seq_len,
37 |     int key_seq_len,
38 |     int batches,
39 |     int attn_heads);
40 | 
41 | torch::Tensor fwd(
42 |     torch::Tensor const& input,
43 |     torch::Tensor const& mask,
44 |     float scale_factor) {
45 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
46 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
47 |              (input.scalar_type() == at::ScalarType::BFloat16),
48 |              "Only fp16 and bf16 are supported");
49 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
50 | 
51 |   return fwd_cuda(input, mask, scale_factor);
52 | }
53 | 
54 | torch::Tensor bwd(
55 |     torch::Tensor const& output_grads,
56 |     torch::Tensor const& softmax_results,
57 |     float scale_factor) {
58 | 
59 |   AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor");
60 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor");
61 | 
62 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
63 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
64 |              "Only fp16 and bf16 are supported");
65 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
66 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
67 |              "Only fp16 and bf16 are supported");
68 | 
69 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
70 | }
71 | 
72 | int get_batch_per_block(
73 |     int query_seq_len,
74 |     int key_seq_len,
75 |     int batches,
76 |     int attn_heads) {
77 |   return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
78 | }
79 | 
80 | } // end namespace scaled_masked_softmax
81 | } // end namespace fused_softmax
82 | } // end namespace multihead_attn
83 | 
84 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
85 |   m.def("forward",
86 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
87 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
88 | 
89 |   m.def("backward",
90 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
91 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
92 | 
93 |   m.def("get_batch_per_block",
94 |         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
95 |         "Return Batch per block size."
96 |   );
97 | }
98 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads,
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
35 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
36 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
37 |              (input.scalar_type() == at::ScalarType::BFloat16),
38 |              "Only fp16 and bf16 are supported");
39 | 
40 |   return fwd_cuda(input, scale_factor);
41 | }
42 | 
43 | torch::Tensor bwd(
44 |     torch::Tensor const& output_grads,
45 |     torch::Tensor const& softmax_results,
46 |     float scale_factor) {
47 | 
48 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
49 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
50 | 
51 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
52 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
53 |              "Only fp16 and bf16 are supported");
54 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
55 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
56 |              "Only fp16 and bf16 are supported");
57 | 
58 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
59 | }
60 | 
61 | } // end namespace scaled_upper_triang_masked_softmax
62 | } // end namespace fused_softmax
63 | } // end namespace multihead_attn
64 | 
65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
66 |   m.def("forward",
67 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
68 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
69 |   m.def("backward",
70 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
71 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
72 | }
73 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <cuda.h>
19 | #include <cuda_runtime.h>
20 | #include <cuda_fp16.h>
21 | #include <cuda_profiler_api.h>
22 | #include <ATen/cuda/CUDAContext.h>
23 | #include <torch/extension.h>
24 | #include "scaled_upper_triang_masked_softmax.h"
25 | #include "type_shim.h"
26 | 
27 | namespace multihead_attn {
28 | namespace fused_softmax {
29 | namespace scaled_upper_triang_masked_softmax {
30 | 
31 | torch::Tensor fwd_cuda(
32 |     torch::Tensor const& input,
33 |     float scale_factor)
34 | {
35 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
36 |   const int attn_batches = input.size(0);
37 |   const int seq_len = input.size(1);
38 |   TORCH_INTERNAL_ASSERT(seq_len <= 2048);
39 | 
40 |   // Output
41 |   auto act_options = input.options().requires_grad(false);
42 |   torch::Tensor softmax_results =
43 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
44 | 
45 |   // Softmax Intermediate Result Ptr
46 |   void* input_ptr = static_cast<void*>(input.data_ptr());
47 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
48 | 
49 |   DISPATCH_HALF_AND_BFLOAT(
50 |       input.scalar_type(),
51 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
52 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
53 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
54 |           reinterpret_cast<const scalar_t*>(input_ptr),
55 |           scale_factor,
56 |           seq_len,
57 |           seq_len,
58 |           attn_batches);
59 |   );
60 |   return softmax_results;
61 | }
62 | 
63 | 
64 | torch::Tensor bwd_cuda(
65 |     torch::Tensor const& output_grads_,
66 |     torch::Tensor const& softmax_results_,
67 |     float scale_factor) {
68 | 
69 |   auto output_grads = output_grads_.contiguous();
70 |   auto softmax_results = softmax_results_.contiguous();
71 | 
72 |   // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
73 |   const int attn_batches = output_grads.size(0);
74 |   const int seq_len = output_grads.size(1);
75 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
76 | 
77 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
78 | 
79 |   // Softmax Grad
80 |   DISPATCH_HALF_AND_BFLOAT(
81 |       output_grads_.scalar_type(),
82 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
83 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
84 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
85 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
86 |           reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
87 |           scale_factor,
88 |           seq_len,
89 |           seq_len,
90 |           attn_batches);
91 |   );
92 | 
93 |   // backward pass is completely in-place
94 |   return output_grads;
95 | }
96 | } // end namespace scaled_upper_triang_masked_softmax
97 | } // end namespace fused_softmax
98 | } // end namespace multihead_attn
99 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/megatron/fused_kernels/tests/__init__.py
--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | 
18 | #include <ATen/ATen.h>
19 | #include "compat.h"
20 | 
21 | 
22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
23 |     switch(TYPE) \
24 |     { \
25 |         case at::ScalarType::Half: \
26 |         { \
27 |             using scalar_t = at::Half; \
28 |             __VA_ARGS__; \
29 |             break; \
30 |         } \
31 |         case at::ScalarType::BFloat16: \
32 |         { \
33 |             using scalar_t = at::BFloat16; \
34 |             __VA_ARGS__; \
35 |             break; \
36 |         } \
37 |         default: \
38 |             AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
39 |     }
40 | 
41 | 
42 | 
43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
44 |     switch(TYPEIN) \
45 |     { \
46 |         case at::ScalarType::Float: \
47 |         { \
48 |             using scalar_t_in = float; \
49 |             switch(TYPEOUT) \
50 |             { \
51 |                 case at::ScalarType::Float: \
52 |                 { \
53 |                     using scalar_t_out = float; \
54 |                     __VA_ARGS__; \
55 |                     break; \
56 |                 } \
57 |                 case at::ScalarType::Half: \
58 |                 { \
59 |                     using scalar_t_out = at::Half; \
60 |                     __VA_ARGS__; \
61 |                     break; \
62 |                 } \
63 |                 case at::ScalarType::BFloat16: \
64 |                 { \
65 |                     using scalar_t_out = at::BFloat16; \
66 |                     __VA_ARGS__; \
67 |                     break; \
68 |                 } \
69 |                 default: \
70 |                     AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
71 |             } \
72 |             break; \
73 |         } \
74 |         case at::ScalarType::Half: \
75 |         { \
76 |             using scalar_t_in = at::Half; \
77 |             using scalar_t_out = at::Half; \
78 |             __VA_ARGS__; \
79 |             break; \
80 |         } \
81 |         case at::ScalarType::BFloat16: \
82 |         { \
83 |             using scalar_t_in = at::BFloat16; \
84 |             using scalar_t_out = at::BFloat16; \
85 |             __VA_ARGS__; \
86 |             break; \
87 |         } \
88 |         default: \
89 |             AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
90 |     }
91 | 
92 | 
--------------------------------------------------------------------------------
/megatron/model/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
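# This module re-exports the model classes so call sites can simply write,
# for example (illustrative):
#
#     from megatron.model import GPTModel, LayerNorm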
15 | 16 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 17 | 18 | from .distributed import DistributedDataParallel 19 | from .bert_model import BertModel 20 | from .gpt_model import GPTModel, GPTModelPipe 21 | from .t5_model import T5Model 22 | from .language_model import get_language_model 23 | from .module import Float16Module 24 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """This code is copied from NVIDIA apex:
17 |       https://github.com/NVIDIA/apex
18 |    with some changes. """
19 | 
20 | import numbers
21 | 
22 | 
23 | from megatron import get_args
24 | from megatron import mpu
25 | from packaging import version
26 | from torch import nn
27 | from torch.nn import init
28 | from torch.nn.parameter import Parameter
29 | import importlib
30 | import torch
31 | import torch.nn.functional as F
32 | 
33 | global fused_mix_prec_layer_norm_cuda
34 | fused_mix_prec_layer_norm_cuda = None
35 | 
36 | 
37 | class FusedLayerNormAffineFunction(torch.autograd.Function):
38 | 
39 |     @staticmethod
40 |     def forward(ctx, input, weight, bias, normalized_shape, eps):
41 | 
42 |         ctx.normalized_shape = normalized_shape
43 |         ctx.eps = eps
44 |         input_ = input.contiguous()
45 |         weight_ = weight.contiguous()
46 |         bias_ = bias.contiguous()
47 |         output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
48 |             input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
49 |         ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
50 | 
51 |         return output
52 | 
53 | 
54 |     @staticmethod
55 |     def backward(ctx, grad_output):
56 | 
57 |         input_, weight_, bias_, mean, invvar = ctx.saved_tensors
58 |         grad_input = grad_weight = grad_bias = None
59 |         grad_input, grad_weight, grad_bias \
60 |             = fused_mix_prec_layer_norm_cuda.backward_affine(
61 |                 grad_output.contiguous(), mean, invvar,
62 |                 input_, ctx.normalized_shape,
63 |                 weight_, bias_, ctx.eps)
64 | 
65 |         return grad_input, grad_weight, grad_bias, None, None
66 | 
67 | 
68 | 
69 | class MixedFusedLayerNorm(torch.nn.Module):
70 | 
71 |     def __init__(self, normalized_shape, eps=1e-5):
72 |         super(MixedFusedLayerNorm, self).__init__()
73 | 
74 |         global fused_mix_prec_layer_norm_cuda
75 |         fused_mix_prec_layer_norm_cuda = importlib.import_module(
76 |             "fused_mix_prec_layer_norm_cuda")
77 | 
78 |         if isinstance(normalized_shape, numbers.Integral):
79 |             normalized_shape = (normalized_shape,)
80 |         self.normalized_shape = torch.Size(normalized_shape)
81 |         self.eps = eps
82 |         self.weight = Parameter(torch.Tensor(*normalized_shape))
83 |         self.bias = Parameter(torch.Tensor(*normalized_shape))
84 |         self.reset_parameters()
85 | 
86 |         args = get_args()
87 |         self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
88 | 
89 |         self.use_meg_ds_fused_layer_norm = (
90 |             args.bf16  # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
91 |             or version.parse(torch.__version__) >= version.parse("1.11.0")  # https://github.com/pytorch/pytorch/pull/66920
92 |         )
93 | 
94 | 
95 |     def reset_parameters(self):
96 | 
97 |         init.ones_(self.weight)
98 |         init.zeros_(self.bias)
99 | 
100 | 
101 |     def forward(self, input):
102 | 
103 |         if self.layernorm_tp_auto_sync:
104 |             torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
105 |             torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
106 | 
107 |         if self.use_meg_ds_fused_layer_norm:
108 |             return FusedLayerNormAffineFunction.apply(
109 |                 input, self.weight, self.bias, self.normalized_shape, self.eps)
110 |         else:
111 |             return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
--------------------------------------------------------------------------------
/megatron/model/glu_activations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | from megatron import logging
6 | from megatron.model.utils import log_debug_usage
7 | 
8 | logger = logging.get_logger(__name__)
9 | 
10 | class _GLUBaseModule(nn.Module):
11 |     def __init__(self, activation_fn):
12 |         super().__init__()
13 |         self.activation_fn = activation_fn
14 | 
15 |     def forward(self, x):
16 |         # dim=-1 breaks in jit for pt<1.10
17 |         x1, x2 = x.chunk(2, dim=(x.ndim - 1))
18 |         return x1 * self.activation_fn(x2)
19 | 
20 | 
21 | class LiGLU(_GLUBaseModule):
22 |     def __init__(self):
23 |         super().__init__(nn.Identity())
24 | 
25 | 
26 | class GEGLU(_GLUBaseModule):
27 |     def __init__(self):
28 |         super().__init__(F.gelu)
29 | 
30 | 
31 | class ReGLU(_GLUBaseModule):
32 |     def __init__(self):
33 |         super().__init__(F.relu)
34 | 
35 | 
36 | class SwiGLU(_GLUBaseModule):
37 |     def __init__(self):
38 |         super().__init__(F.silu)
39 | 
40 | 
41 | liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU()))
42 | geglu = log_debug_usage(logger, "Using GLU activation: GEGLU.")(torch.jit.script(GEGLU()))
43 | reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU()))
44 | swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU()))
45 | 
46 | 
47 | GLU_ACTIVATIONS = {
48 |     "geglu": geglu,
49 |     "liglu": liglu,
50 |     "reglu": reglu,
51 |     "swiglu": swiglu,
52 | }
53 | 
--------------------------------------------------------------------------------
/megatron/model/positional_embeddings.py:
--------------------------------------------------------------------------------
1 | # Extracted from: https://github.com/EleutherAI/gpt-neox
2 | import torch
3 | 
4 | 
5 | class RotaryEmbedding(torch.nn.Module):
6 | 
7 |     def __init__(self, dim, base=10000, precision=torch.half):
8 |         super().__init__()
9 |         inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
10 |         self.register_buffer('inv_freq', inv_freq)
11 |         self.max_seq_len_cached = None
12 |         self.cos_cached = None
13 |         self.sin_cached = None
14 |         self.precision = precision
15 | 
16 |     def forward(self, x, seq_dim=1, seq_len=None):
17 |         if seq_len is None:
18 |             seq_len = x.shape[seq_dim]
19 |         if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
20 |             self.max_seq_len_cached = seq_len
21 |             t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
22 |             freqs = torch.einsum('i,j->ij', t, self.inv_freq)
23 |             # Different from paper, but it uses a different permutation in order to obtain the same calculation
24 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
25 |             if self.precision == torch.bfloat16:
26 |                 emb = emb.float()
27 |             # [sx, 1 (b * np), hn]
28 |             self.cos_cached = emb.cos()[:, None, :]
29 |             self.sin_cached = emb.sin()[:, None, :]
30 |             if self.precision == torch.bfloat16:
31 |                 self.cos_cached = self.cos_cached.bfloat16()
32 |                 self.sin_cached = self.sin_cached.bfloat16()
33 |         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
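# Illustrative usage sketch (assumed shapes: [seq_len, batch*heads, head_dim],
# matching the [sx, 1 (b * np), hn] cache layout noted above; the helper
# apply_rotary_pos_emb is defined below):
#
#     rotary = RotaryEmbedding(dim=64)
#     q = torch.randn(128, 16, 64)
#     k = torch.randn(128, 16, 64)
#     cos, sin = rotary(q, seq_len=128)                  # each [128, 1, 64]
#     q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)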
34 | 35 | 36 | # rotary pos emb helpers: 37 | 38 | def rotate_half(x): 39 | x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] 40 | return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions 41 | 42 | 43 | @torch.jit.script 44 | def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): 45 | cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...] 46 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) 47 | 48 | 49 | def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0): # jitting fails with bf16 50 | cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...] 51 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | from functools import wraps 20 | 21 | import torch 22 | 23 | from megatron import get_args 24 | 25 | def init_method_normal(sigma): 26 | """Init method based on N(0, sigma).""" 27 | def init_(tensor): 28 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 29 | 30 | return init_ 31 | 32 | 33 | def scaled_init_method_normal(sigma, num_layers): 34 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 35 | std = sigma / math.sqrt(2.0 * num_layers) 36 | 37 | def init_(tensor): 38 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 39 | 40 | return init_ 41 | 42 | 43 | def attention_mask_func(attention_scores, attention_mask): 44 | args = get_args() 45 | if args.curriculum_learning: 46 | attention_mask_ = attention_mask 47 | actual_seqlen = attention_scores.size()[2] 48 | if actual_seqlen != attention_mask_.size()[2]: 49 | # attention_mask has size [1, 1, seqlen, seqlen] 50 | attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() 51 | attention_scores.masked_fill_(attention_mask_, torch.finfo(attention_scores.dtype).min) 52 | else: 53 | attention_scores.masked_fill_(attention_mask, torch.finfo(attention_scores.dtype).min) 54 | return attention_scores 55 | 56 | 57 | def get_linear_layer(rows, columns, init_method): 58 | """Simple linear layer with weight initialization.""" 59 | layer = torch.nn.Linear(rows, columns) 60 | init_method(layer.weight) 61 | with torch.no_grad(): 62 | layer.bias.zero_() 63 | return layer 64 | 65 | @torch.jit.script 66 | def gelu_impl(x): 67 | """OpenAI's gelu implementation.""" 68 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 69 | (1.0 + 0.044715 * x * x))) 70 | def openai_gelu(x): 71 | return gelu_impl(x) 72 | 73 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX 
exporter 74 | @torch.jit.script 75 | def erf_gelu(x): 76 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 77 | 78 | def log_debug_usage(logger, msg: str): 79 | def log_debug_usage_(func): 80 | """Helper function in order to log a message when using a function for the first time""" 81 | func.__logged_message__ = False 82 | 83 | @wraps(func) 84 | def wrapped(*args, **kwargs): 85 | if func.__logged_message__ is False: 86 | logger.debug(msg) 87 | func.__logged_message__ = True 88 | return func(*args, **kwargs) 89 | 90 | return wrapped 91 | return log_debug_usage_ 92 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import get_tensor_model_parallel_src_rank 35 | from .initialize import get_pipeline_model_parallel_first_rank 36 | from .initialize import get_pipeline_model_parallel_last_rank 37 | from .initialize import get_pipeline_model_parallel_next_rank 38 | from .initialize import get_pipeline_model_parallel_prev_rank 39 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 40 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 41 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 42 | from .initialize import initialize_model_parallel 43 | from .initialize import model_parallel_is_initialized 44 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 45 | 46 | from .layers import ColumnParallelLinear 47 | from .layers import RowParallelLinear 48 | from .layers import VocabParallelEmbedding 49 | from .layers import (set_tensor_model_parallel_attributes, 50 | set_defaults_if_not_set_tensor_model_parallel_attributes, 51 | 
                     copy_tensor_model_parallel_attributes)
52 | 
53 | from .mappings import copy_to_tensor_model_parallel_region
54 | from .mappings import gather_from_tensor_model_parallel_region
55 | from .mappings import reduce_from_tensor_model_parallel_region
56 | from .mappings import scatter_to_tensor_model_parallel_region
57 | 
58 | from .random import checkpoint
59 | from .random import get_cuda_rng_tracker
60 | from .random import init_checkpointed_activations_memory_buffer
61 | from .random import model_parallel_cuda_manual_seed
62 | from .random import reset_checkpointed_activations_memory_buffer
63 | from .random import gather_split_1d_tensor
64 | from .random import split_tensor_into_1d_equal_chunks
65 | 
66 | from .utils import divide
67 | from .utils import split_tensor_along_last_dim
68 | 
--------------------------------------------------------------------------------
/megatron/mpu/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/megatron/mpu/tests/__init__.py
--------------------------------------------------------------------------------
/megatron/mpu/tests/commons.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import argparse
17 | import os
18 | import random
19 | import numpy
20 | import torch
21 | 
22 | import mpu
23 | 
24 | 
25 | class IdentityLayer(torch.nn.Module):
26 |     def __init__(self, size, scale=1.0):
27 |         super(IdentityLayer, self).__init__()
28 |         self.weight = torch.nn.Parameter(scale * torch.randn(size))
29 | 
30 |     def forward(self):
31 |         return self.weight
32 | 
33 | 
34 | def set_random_seed(seed):
35 |     """Set random seed for reproducibility."""
36 |     random.seed(seed)
37 |     numpy.random.seed(seed)
38 |     torch.manual_seed(seed)
39 |     mpu.model_parallel_cuda_manual_seed(seed)
40 | 
41 | 
42 | def initialize_distributed(backend='nccl'):
43 |     """Initialize torch.distributed."""
44 |     # Get local rank in case it is provided.
45 |     parser = argparse.ArgumentParser()
46 |     parser.add_argument('--local_rank', type=int, default=None,
47 |                         help='local rank passed from distributed launcher')
48 |     args = parser.parse_args()
49 |     local_rank = args.local_rank
50 | 
51 |     # Get rank and world size.
52 |     rank = int(os.getenv('RANK', '0'))
53 |     world_size = int(os.getenv("WORLD_SIZE", '1'))
54 | 
55 |     print('> initializing torch.distributed with local rank: {}, '
56 |           'rank: {}, world size: {}'.format(local_rank, rank, world_size))
57 | 
58 |     # Set the device id.
59 |     device = rank % torch.cuda.device_count()
60 |     if local_rank is not None:
61 |         device = local_rank
62 |     torch.cuda.set_device(device)
63 | 
64 |     # Call the init process.
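    # With the defaults below this resolves to init_method='tcp://localhost:6000'.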
65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_broadcast_data(tensor_model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing broadcast_data with model parallel size {} ...'. 
31 |           format(tensor_model_parallel_size))
32 | 
33 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
34 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
35 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
36 | 
37 |     key_size_t = {'key1': [7, 11],
38 |                   'key2': [8, 2, 1],
39 |                   'key3': [13],
40 |                   'key4': [5, 1, 2],
41 |                   'key5': [5, 12]}
42 |     keys = list(key_size_t.keys())
43 | 
44 |     data = {}
45 |     data_t = {}
46 |     for key in key_size_t:
47 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
48 |         data_t[key] = data[key].clone()
49 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
50 |     data_t['keyX'] = data['keyX'].clone()
51 |     if mpu.get_tensor_model_parallel_rank() != 0:
52 |         data = None
53 | 
54 |     data_utils._check_data_types(keys, data_t, torch.int64)
55 |     key_size, key_numel, \
56 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
57 |     for key in keys:
58 |         assert key_size[key] == key_size_t[key]
59 |     total_numel_t = 0
60 |     for key in keys:
61 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
62 |         assert key_numel[key] == target_size
63 |         total_numel_t += target_size
64 |     assert total_numel == total_numel_t
65 | 
66 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
67 |     for key in keys:
68 |         tensor = data_t[key].cuda()
69 |         assert data_b[key].sub(tensor).abs().max() == 0
70 | 
71 |     # Reset groups
72 |     mpu.destroy_model_parallel()
73 | 
74 |     torch.distributed.barrier()
75 |     if torch.distributed.get_rank() == 0:
76 |         print('>> passed the test :-)')
77 | 
78 | 
79 | if __name__ == '__main__':
80 | 
81 |     initialize_distributed()
82 |     world_size = torch.distributed.get_world_size()
83 | 
84 |     tensor_model_parallel_size = 1
85 |     while tensor_model_parallel_size <= world_size:
86 |         print_separator('test broadcast data')
87 |         test_broadcast_data(tensor_model_parallel_size)
88 |         tensor_model_parallel_size *= 2
89 | 
--------------------------------------------------------------------------------
/megatron/mpu/tests/test_initialize.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from commons import print_separator
17 | from commons import initialize_distributed
18 | import mpu
19 | import torch
20 | import sys
21 | sys.path.append("../..")
22 | 
23 | 
24 | def test_initialize_model_parallel(tensor_model_parallel_size):
25 | 
26 |     if torch.distributed.get_rank() == 0:
27 |         print('> testing initialize_model_parallel with size {} ...'.format(
28 |             tensor_model_parallel_size))
29 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
30 |                                       torch.distributed.get_world_size())
31 |     assert not mpu.model_parallel_is_initialized()
32 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
33 |     assert mpu.model_parallel_is_initialized()
34 | 
35 |     # Checks.
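    # Megatron places each tensor-model-parallel group on consecutive global
    # ranks, so e.g. with world_size=8 and size T=2: TP rank = rank % 2 and
    # data-parallel rank = rank // 2 (verified below).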
36 |     def check(group, world_size, rank):
37 |         assert world_size == torch.distributed.get_world_size(group=group)
38 |         assert rank == torch.distributed.get_rank(group=group)
39 | 
40 |     # Model parallel.
41 |     world_size = tensor_model_parallel_size_
42 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
43 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
44 |     assert rank == mpu.get_tensor_model_parallel_rank()
45 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
46 | 
47 |     # Data parallel.
48 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
49 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size_
50 |     assert world_size == mpu.get_data_parallel_world_size()
51 |     assert rank == mpu.get_data_parallel_rank()
52 |     check(mpu.get_data_parallel_group(), world_size, rank)
53 | 
54 |     # Reset groups
55 |     mpu.destroy_model_parallel()
56 | 
57 |     torch.distributed.barrier()
58 |     if torch.distributed.get_rank() == 0:
59 |         print('>> passed the test :-)')
60 | 
61 | 
62 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
63 | 
64 |     if torch.distributed.get_rank() == 0:
65 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
66 |             tensor_model_parallel_size_))
67 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
68 |                                      torch.distributed.get_world_size())
69 |     assert not mpu.model_parallel_is_initialized()
70 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
71 |     assert mpu.model_parallel_is_initialized()
72 | 
73 |     # Checks
74 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
75 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
76 | 
77 |     # Reset groups
78 |     mpu.destroy_model_parallel()
79 | 
80 |     torch.distributed.barrier()
81 |     if torch.distributed.get_rank() == 0:
82 |         print('>> passed the test :-)')
83 | 
84 | 
85 | if __name__ == '__main__':
86 | 
87 |     initialize_distributed()
88 |     world_size = torch.distributed.get_world_size()
89 |     tensor_model_parallel_size = 1
90 |     while tensor_model_parallel_size <= world_size:
91 |         print_separator('test initialize model parallel')
92 |         test_initialize_model_parallel(tensor_model_parallel_size)
93 |         print_separator('test model parallel source rank')
94 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
95 |         tensor_model_parallel_size *= 2
96 | 
--------------------------------------------------------------------------------
/megatron/mpu/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
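# Quick reference for the helpers below (illustrative values):
#
#     divide(10, 2)                                       -> 5
#     divide(10, 3)                                       -> AssertionError
#     split_tensor_along_last_dim(torch.empty(4, 6), 3)   -> three [4, 2] views
#     VocabUtility.vocab_range_from_global_vocab_size(
#         50304, rank=1, world_size=4)                    -> (12576, 25152)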
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/megatron/package_info.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | MAJOR = 1
17 | MINOR = 1.5
18 | 
19 | # Use the following formatting: (major, minor)
20 | VERSION = (MAJOR, MINOR)
21 | 
22 | __version__ = '.'.join(map(str, VERSION)) + '.bs'  # e.g. '1.1.5.bs'
23 | __package_name__ = 'megatron-lm'
24 | __contact_names__ = 'NVIDIA INC'
25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM'
26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /pretrain_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Pretrain VIT""" 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | from megatron import get_args, get_timers, mpu, print_rank_0 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from megatron.model.vit_model import VitModel 23 | from megatron.training import pretrain 24 | from megatron.utils import average_losses_across_data_parallel_group 25 | 26 | def model_provider(): 27 | """Build the model.""" 28 | 29 | print_rank_0("building VIT model ...") 30 | args = get_args() 31 | 32 | model = VitModel(num_classes=args.num_classes) 33 | return model 34 | 35 | def get_batch(data_iterator): 36 | """Build the batch.""" 37 | data = next(data_iterator) 38 | 39 | # only data parallelism; no need for broadcast 40 | images = data[0].cuda() 41 | labels = data[1].cuda() 42 | 43 | return images, labels 44 | 45 | def forward_step(data_iterator, model, input_tensor): 46 | """Forward step.""" 47 | timers = get_timers() 48 | assert input_tensor is None 49 | 50 | # Get the batch. 51 | timers("batch-generator").start() 52 | ( 53 | images, 54 | labels, 55 | ) = get_batch(data_iterator) 56 | timers("batch-generator").stop() 57 | 58 | # Forward model. 
59 | logits = model(images).contiguous().float() 60 | loss = F.cross_entropy(logits, labels) 61 | 62 | outputs = torch.argmax(logits, -1) 63 | correct = (outputs == labels).float() 64 | accuracy = torch.mean(correct) 65 | 66 | averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) 67 | 68 | return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} 69 | 70 | 71 | def train_valid_test_datasets_provider(train_val_test_num_samples): 72 | """Build train, valid, and test datasets.""" 73 | args = get_args() 74 | 75 | print_rank_0( 76 | "> building train, validation, and test datasets " "for VIT ..." 77 | ) 78 | train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path) 79 | print_rank_0("> finished creating VIT datasets ...") 80 | 81 | return train_ds, valid_ds, None 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | pretrain( 87 | train_valid_test_datasets_provider, 88 | model_provider, 89 | forward_step, 90 | args_defaults={'dataloader_type': 'cyclic'} 91 | ) 92 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | nltk 3 | numpy 4 | parameterized 5 | pybind11 6 | regex 7 | six 8 | tensorboard 9 | torch>=1.7 10 | transformers 11 | DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git 12 | # versions from HF transformers 13 | black==21.4b0 14 | isort>=5.5.4 15 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | DIR=`pwd` 5 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 6 | #mkdir -p $DIR/logs 7 | #mkdir -p /tmp/logs 8 | 9 | 10 | #DATASET_1="" 11 | #DATASET_2="" 12 | #DATASET_3="" 13 | #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 14 | 15 | BASE_DATA_PATH=/data/Megatron-LM/data 16 | DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron 17 | VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json 18 | MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt 19 | 20 | 21 | script_path=$(realpath $0) 22 | script_dir=$(dirname $script_path) 23 | #CONFIG_JSON="$script_dir/ds_config.json" 24 | CONFIG_JSON="/tmp/ds_config.json" 25 | 26 | USE_DEEPSPEED=1 27 | ZERO_STAGE=0 28 | 29 | 30 | # Debug 31 | #TP=4 32 | #PP=4 33 | #LAYERS=8 34 | #HIDDEN=512 35 | #SEQ=1024 36 | #GLOBAL_BATCH=128 37 | #WORKER_STR="-i worker-0" 38 | 39 | 40 | TP=1 41 | PP=2 42 | HIDDEN=1024 43 | LAYERS=24 44 | SEQ=1024 45 | GLOBAL_BATCH=2 46 | WORKER_STR="" 47 | 48 | MICRO_BATCH=1 49 | 50 | DTYPE="bf16" 51 | 52 | LOG_DIR="/tmp/tensorboard/tp${TP}_pp${PP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_${DTYPE}_fix3" 53 | mkdir -p $LOG_DIR 54 | 55 | while [[ $# -gt 0 ]] 56 | do 57 | key="$1" 58 | case $key in 59 | --no-deepspeed) 60 | USE_DEEPSPEED=0; 61 | shift 62 | ;; 63 | -z|--zero-stage) 64 | ZERO_STAGE=$2; 65 | shift 66 | ;; 67 | *) 68 | echo "Unknown argument(s)" 69 | usage 70 | exit 1 71 | shift 72 | ;; 73 | esac 74 | done 75 | 76 | 77 | options=" \ 78 | --tensor-model-parallel-size $TP \ 79 | --pipeline-model-parallel-size $PP \ 80 | --num-layers $LAYERS \ 81 | --hidden-size $HIDDEN \ 82 |
--num-attention-heads 32 \ 83 | --seq-length $SEQ \ 84 | --loss-scale 12 \ 85 | --max-position-embeddings $SEQ \ 86 | --micro-batch-size $MICRO_BATCH \ 87 | --global-batch-size $GLOBAL_BATCH \ 88 | --train-iters 1000 \ 89 | --lr 6.0e-5 \ 90 | --min-lr 6.0e-6 \ 91 | --lr-decay-style cosine \ 92 | --log-interval 1 \ 93 | --eval-iters 40 \ 94 | --eval-interval 1000 \ 95 | --data-path ${DATASET} \ 96 | --vocab-file ${VOCAB_PATH} \ 97 | --merge-file ${MERGE_PATH} \ 98 | --save-interval 10000 \ 99 | --split 98,2,0 \ 100 | --clip-grad 1.0 \ 101 | --weight-decay 0.1 \ 102 | --adam-beta1 0.9 \ 103 | --adam-beta2 0.95 \ 104 | --init-method-std 0.006 \ 105 | --${DTYPE} \ 106 | --checkpoint-activations \ 107 | --exit-interval 10000 \ 108 | --tensorboard-dir $LOG_DIR 109 | " 110 | 111 | 112 | if [[ ${USE_DEEPSPEED} -eq 1 ]]; then 113 | echo "Using DeepSpeed" 114 | options="${options} \ 115 | --deepspeed \ 116 | --deepspeed_config=${CONFIG_JSON} \ 117 | --zero-stage=${ZERO_STAGE} \ 118 | --deepspeed-activation-checkpointing \ 119 | " 120 | fi 121 | 122 | 123 | cat <<EOT > $CONFIG_JSON 124 | { 125 | "train_batch_size" : $GLOBAL_BATCH, 126 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 127 | "steps_per_print": 1, 128 | 129 | "zero_optimization": { 130 | "stage": $ZERO_STAGE 131 | }, 132 | 133 | "bf16": { 134 | "enabled": true 135 | }, 136 | 137 | "fp16": { 138 | "enabled": false, 139 | "loss_scale": 0, 140 | "loss_scale_window": 500, 141 | "hysteresis": 2, 142 | "min_loss_scale": 1, 143 | "initial_scale_power": 12 144 | }, 145 | 146 | "wall_clock_breakdown" : true 147 | } 148 | EOT 149 | 150 | WORKER_STR="-i worker-0:0,1" 151 | #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" 152 | #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" 153 | run_cmd="deepspeed $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" 154 | 155 | 156 | echo ${run_cmd} 157 | eval ${run_cmd} 158 | 159 | set +x 160 | -------------------------------------------------------------------------------- /run_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | DIR=`pwd` 5 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 6 | #mkdir -p $DIR/logs 7 | #mkdir -p /tmp/logs 8 | 9 | 10 | #DATASET_1="" 11 | #DATASET_2="" 12 | #DATASET_3="" 13 | #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 14 | 15 | BASE_DATA_PATH=/data/Megatron-LM/data 16 | DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron 17 | VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json 18 | MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt 19 | 20 | 21 | script_path=$(realpath $0) 22 | script_dir=$(dirname $script_path) 23 | #CONFIG_JSON="$script_dir/ds_config.json" 24 | CONFIG_JSON="/tmp/ds_config.json" 25 | 26 | USE_DEEPSPEED=1 27 | ZERO_STAGE=0 28 | 29 | 30 | # Debug 31 | #TP=4 32 | #PP=4 33 | #LAYERS=8 34 | #HIDDEN=512 35 | #SEQ=1024 36 | #GLOBAL_BATCH=128 37 | #WORKER_STR="-i worker-0" 38 | 39 | 40 | TP=1 41 | PP=1 42 | DP=2 43 | WORLD_SIZE=$((TP*PP*DP)) 44 | HIDDEN=1024 45 | LAYERS=24 46 | SEQ=1024 47 | GLOBAL_BATCH=1 48 | WORKER_STR="" 49 | 50 | MICRO_BATCH=1 51 | LR=6.0e-4 52 | MIN_LR=6.0e-5 53 | DTYPE="fp16" 54 | EXP_DIR=${HOME}/experiments/results/bf16 55 | LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3" 56 | mkdir -p $LOG_DIR 57 | 58 | while [[ $# -gt 0 ]] 59 | do 60 | key="$1" 61 | case $key in 62 | --no-deepspeed) 63 | USE_DEEPSPEED=0; 64 | shift
65 | ;; 66 | -z|--zero-stage) 67 | ZERO_STAGE=$2; 68 | shift 69 | ;; 70 | *) 71 | echo "Unknown argument(s)" 72 | usage 73 | exit 1 74 | shift 75 | ;; 76 | esac 77 | done 78 | 79 | 80 | options=" \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --num-layers $LAYERS \ 84 | --hidden-size $HIDDEN \ 85 | --num-attention-heads 32 \ 86 | --seq-length $SEQ \ 87 | --loss-scale 12 \ 88 | --max-position-embeddings $SEQ \ 89 | --micro-batch-size $MICRO_BATCH \ 90 | --global-batch-size $GLOBAL_BATCH \ 91 | --train-iters 1000 \ 92 | --lr $LR \ 93 | --min-lr $MIN_LR \ 94 | --lr-decay-style cosine \ 95 | --log-interval 1 \ 96 | --eval-iters 40 \ 97 | --eval-interval 10 \ 98 | --data-path ${DATASET} \ 99 | --vocab-file ${VOCAB_PATH} \ 100 | --merge-file ${MERGE_PATH} \ 101 | --save-interval 10000 \ 102 | --split 98,2,0 \ 103 | --clip-grad 1.0 \ 104 | --weight-decay 0.1 \ 105 | --adam-beta1 0.9 \ 106 | --adam-beta2 0.95 \ 107 | --init-method-std 0.006 \ 108 | --${DTYPE} \ 109 | --checkpoint-activations \ 110 | --exit-interval 10000 \ 111 | --tensorboard-dir $LOG_DIR 112 | " 113 | 114 | 115 | if [[ ${USE_DEEPSPEED} -eq 1 ]]; then 116 | echo "Using DeepSpeed" 117 | options="${options} \ 118 | --deepspeed \ 119 | --deepspeed_config=${CONFIG_JSON} \ 120 | --zero-stage=${ZERO_STAGE} \ 121 | --deepspeed-activation-checkpointing \ 122 | " 123 | fi 124 | 125 | 126 | cat <<EOT > $CONFIG_JSON 127 | { 128 | "train_batch_size" : $GLOBAL_BATCH, 129 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 130 | "steps_per_print": 1, 131 | 132 | "zero_optimization": { 133 | "stage": $ZERO_STAGE 134 | }, 135 | 136 | "bf16": { 137 | "enabled": false 138 | }, 139 | 140 | "fp16": { 141 | "enabled": true, 142 | "loss_scale": 0, 143 | "loss_scale_window": 500, 144 | "hysteresis": 2, 145 | "min_loss_scale": 1, 146 | "initial_scale_power": 8 147 | }, 148 | 149 | "wall_clock_breakdown" : true 150 | } 151 | EOT 152 | 153 | WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" 154 | #WORKER_STR="-i worker-0:0,1,2,3" 155 | #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" 156 | #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" 157 | run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" 158 | 159 | 160 | echo ${run_cmd} 161 | eval ${run_cmd} 162 | 163 | set +x 164 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | This directory should be organized into sub-folders, one per kind of script.
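The two launcher scripts above both write their DeepSpeed config through a shell heredoc. A minimal Python equivalent of what `run_fp16.sh` emits, for anyone who prefers to generate the file programmatically (keys mirror the heredoc; values are the script's defaults):

```python
# Sketch: generate the same /tmp/ds_config.json that run_fp16.sh writes.
import json

ds_config = {
    "train_batch_size": 1,                # $GLOBAL_BATCH
    "train_micro_batch_size_per_gpu": 1,  # $MICRO_BATCH
    "steps_per_print": 1,
    "zero_optimization": {"stage": 0},    # $ZERO_STAGE
    "bf16": {"enabled": False},
    "fp16": {
        "enabled": True,
        "loss_scale": 0,                  # 0 selects dynamic loss scaling
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 8,
    },
    "wall_clock_breakdown": True,
}

with open("/tmp/ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```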
4 | -------------------------------------------------------------------------------- /scripts/bloom-inference-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Inference scripts for BLOOM 2 | 3 | Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts 4 | -------------------------------------------------------------------------------- /scripts/bloom-inference-server/README.md: -------------------------------------------------------------------------------- 1 | ## Inference solutions for BLOOM 176B 2 | 3 | Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-server 4 | -------------------------------------------------------------------------------- /scripts/test_multiple_dataset_sampling/create_dummy_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--dir', 7 | type=str, 8 | required=True, 9 | help='directory to save data' 10 | ) 11 | args = parser.parse_args() 12 | return args 13 | 14 | def main(): 15 | args = get_args() 16 | 17 | for i in range(10): 18 | 19 | row_limit = 1000 20 | rows_to_save = [{'text': ''.join([str(i)+'-*']*128)}] 21 | 22 | with open('{}/dataset_{}.json'.format(args.dir, i), 'w') as f: 23 | f.write( 24 | '\n'.join(json.dumps(_i) for _i in rows_to_save*row_limit) 25 | ) 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = megatron 7 | known_third_party = 8 | apex 9 | codecarbon 10 | datasets 11 | deepspeed 12 | git 13 | nltk 14 | numpy 15 | pytest 16 | tensorboard 17 | torch 18 | tqdm 19 | transformers 20 | 21 | line_length = 119 22 | lines_after_imports = 2 23 | multi_line_output = 3 24 | use_parentheses = True -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
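As a quick check of what `create_dummy_dataset.py` above produces: each of the ten output files holds 1000 identical JSON lines whose `text` field repeats `<i>-*` 128 times. A tiny reader sketch (the directory path is illustrative, use whatever `--dir` you passed):

```python
# Sketch: verify the shape of the dummy data written above.
# The directory path is illustrative.
import json

with open('/tmp/dummy/dataset_3.json') as f:
    first_row = json.loads(f.readline())

assert first_row['text'] == '3-*' * 128
```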
15 | 16 | """Setup for pip package.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tasks data utility.""" 17 | 18 | import re 19 | import numpy as np 20 | 21 | 22 | def clean_text(text): 23 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 24 | 25 | text = text.replace("\n", " ") 26 | text = re.sub(r'\s+', ' ', text) 27 | for _ in range(3): 28 | text = text.replace(' . ', '. ') 29 | 30 | return text 31 | 32 | 33 | def build_sample(ids, types, paddings, label, unique_id): 34 | """Convert to numpy and return a sample consumed by the batch producer.""" 35 | 36 | ids_np = np.array(ids, dtype=np.int64) 37 | types_np = np.array(types, dtype=np.int64) 38 | paddings_np = np.array(paddings, dtype=np.int64) 39 | sample = ({'text': ids_np, 40 | 'types': types_np, 41 | 'padding_mask': paddings_np, 42 | 'label': int(label), 43 | 'uid': int(unique_id)}) 44 | 45 | return sample 46 | 47 | 48 | def build_tokens_types_paddings_from_text(text_a, text_b, 49 | tokenizer, max_seq_length): 50 | """Build token types and paddings, trim if needed, and pad if needed.""" 51 | 52 | text_a_ids = tokenizer.tokenize(text_a) 53 | text_b_ids = None 54 | if text_b is not None: 55 | text_b_ids = tokenizer.tokenize(text_b) 56 | 57 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 58 | max_seq_length, tokenizer.cls, 59 | tokenizer.sep, tokenizer.pad) 60 | 61 | 62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, 63 | cls_id, sep_id, pad_id): 64 | """Build token types and paddings, trim if needed, and pad if needed.""" 65 | 66 | ids = [] 67 | types = [] 68 | paddings = [] 69 | 70 | # [CLS]. 71 | ids.append(cls_id) 72 | types.append(0) 73 | paddings.append(1) 74 | 75 | # A. 76 | len_text_a = len(text_a_ids) 77 | ids.extend(text_a_ids) 78 | types.extend([0] * len_text_a) 79 | paddings.extend([1] * len_text_a) 80 | 81 | # [SEP]. 82 | ids.append(sep_id) 83 | types.append(0) 84 | paddings.append(1) 85 | 86 | # B. 87 | if text_b_ids is not None: 88 | len_text_b = len(text_b_ids) 89 | ids.extend(text_b_ids) 90 | types.extend([1] * len_text_b) 91 | paddings.extend([1] * len_text_b) 92 | 93 | # Cap the size. 94 | trimmed = False 95 | if len(ids) >= max_seq_length: 96 | max_seq_length_m1 = max_seq_length - 1 97 | ids = ids[0:max_seq_length_m1] 98 | types = types[0:max_seq_length_m1] 99 | paddings = paddings[0:max_seq_length_m1] 100 | trimmed = True 101 | 102 | # [SEP]. 103 | if (text_b_ids is not None) or trimmed: 104 | ids.append(sep_id) 105 | if text_b_ids is None: 106 | types.append(0) 107 | else: 108 | types.append(1) 109 | paddings.append(1) 110 | 111 | # Padding. 112 | padding_length = max_seq_length - len(ids) 113 | if padding_length > 0: 114 | ids.extend([pad_id] * padding_length) 115 | types.extend([pad_id] * padding_length) 116 | paddings.extend([0] * padding_length) 117 | 118 | return ids, types, paddings 119 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # Downloads the specified tasks in the evaluation harness 2 | # This is particularly useful when running in environments where the GPU nodes 3 | # do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
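To make the packing logic of `build_tokens_types_paddings_from_ids` above concrete, here is a small worked example. The `cls`/`sep`/`pad` ids (101/102/0) are illustrative values, not taken from any particular tokenizer:

```python
# Worked example for build_tokens_types_paddings_from_ids above.
from tasks.data_utils import build_tokens_types_paddings_from_ids

ids, types, paddings = build_tokens_types_paddings_from_ids(
    text_a_ids=[5, 6, 7], text_b_ids=[8, 9],
    max_seq_length=10, cls_id=101, sep_id=102, pad_id=0)

# [CLS] A A A [SEP] B B [SEP] pad pad
assert ids == [101, 5, 6, 7, 102, 8, 9, 102, 0, 0]
assert types == [0, 0, 0, 0, 0, 1, 1, 1, 0, 0]  # note: padded with pad_id
assert paddings == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
```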
4 | 5 | from lm_eval import tasks 6 | from lm_eval.tasks import ALL_TASKS 7 | import argparse 8 | import os 9 | 10 | 11 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 12 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 13 | args = parser.parse_args() 14 | 15 | def main(): 16 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 17 | tasks.get_task_dict(task_list) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | 23 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # this script converts results.json: 4 | # 5 | # "results": { 6 | # "arc_challenge": { 7 | # "acc": 0.24232081911262798, 8 | # "acc_stderr": 0.01252159329580012, 9 | # "acc_norm": 0.2764505119453925, 10 | # "acc_norm_stderr": 0.013069662474252425 11 | # }, 12 | # 13 | # into a format expected by a spreadsheet, which is: 14 | # 15 | # task metric value err 16 | # arc_challenge acc xxx yyy 17 | # arc_challenge acc_norm xxx yyy 18 | # arc_challenge f1 xxx yyy 19 | # 20 | # usage: 21 | # report-to-csv.py results.json 22 | 23 | 24 | import sys 25 | import json 26 | import io 27 | import csv 28 | 29 | results_file = sys.argv[1] 30 | 31 | csv_file = results_file.replace("json", "csv") 32 | 33 | print(f"Converting {results_file} to {csv_file}") 34 | 35 | with io.open(results_file, 'r', encoding='utf-8') as f: 36 | results = json.load(f) 37 | 38 | with io.open(csv_file, 'w', encoding='utf-8') as f: 39 | 40 | writer = csv.writer(f) 41 | writer.writerow(["task", "metric", "value", "err", "version"]) 42 | 43 | versions = results["versions"] 44 | 45 | for k,v in sorted(results["results"].items()): 46 | if k not in versions: 47 | versions[k] = -1 48 | 49 | if "acc" in v: 50 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 51 | if "acc_norm" in v: 52 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 53 | if "f1" in v: 54 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 55 | # if "ppl" in v: 56 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 57 | # if "em" in v: 58 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 59 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs. 34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- /tasks/glue/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
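As a sketch of what a concrete task supplies on top of `GLUEAbstractDataset` above, here is a hypothetical TSV-backed subclass; the column layout and label map are invented for illustration (see `tasks/glue/mnli.py` below for a real implementation):

```python
# Hypothetical minimal GLUEAbstractDataset subclass (illustrative only).
from tasks.glue.data import GLUEAbstractDataset

LABELS = {'negative': 0, 'positive': 1}


class ToyPairDataset(GLUEAbstractDataset):

    def __init__(self, name, datapaths, tokenizer, max_seq_length):
        super().__init__('TOY', name, datapaths, tokenizer, max_seq_length)

    def process_samples_from_single_path(self, datapath):
        # Assumes each line is: <text_a> <TAB> <text_b> <TAB> <label>
        samples = []
        with open(datapath, 'r') as f:
            for uid, line in enumerate(f):
                text_a, text_b, label = line.strip().split('\t')
                samples.append({'text_a': text_a, 'text_b': text_b,
                                'label': LABELS[label], 'uid': uid})
        return samples
```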
15 | 16 | """GLUE finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.classification import Classification 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | 26 | 27 | def glue_classification(num_classes, Dataset, 28 | name_from_datapath_func): 29 | 30 | def train_valid_datasets_provider(): 31 | """Build train and validation dataset.""" 32 | args = get_args() 33 | tokenizer = get_tokenizer() 34 | 35 | train_dataset = Dataset('training', args.train_data, 36 | tokenizer, args.seq_length) 37 | valid_dataset = Dataset('validation', args.valid_data, 38 | tokenizer, args.seq_length) 39 | 40 | return train_dataset, valid_dataset 41 | 42 | def model_provider(pre_process=True, post_process=True): 43 | """Build the model.""" 44 | args = get_args() 45 | 46 | print_rank_0('building classification model for {} ...'.format( 47 | args.task)) 48 | model = Classification(num_classes=num_classes, num_tokentypes=2, 49 | pre_process=pre_process, post_process=post_process) 50 | 51 | return model 52 | 53 | def metrics_func_provider(): 54 | """Provide metrics callback function.""" 55 | def single_dataset_provider(datapath): 56 | args = get_args() 57 | tokenizer = get_tokenizer() 58 | 59 | name = name_from_datapath_func(datapath) 60 | return Dataset(name, [datapath], tokenizer, args.seq_length) 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | """Finetune/evaluate.""" 64 | finetune(train_valid_datasets_provider, model_provider, 65 | end_of_epoch_callback_provider=metrics_func_provider) 66 | 67 | 68 | def main(): 69 | args = get_args() 70 | 71 | if args.task == 'MNLI': 72 | 73 | num_classes = 3 74 | from tasks.glue.mnli import MNLIDataset as Dataset 75 | 76 | def name_from_datapath(datapath): 77 | return datapath.split('MNLI')[-1].strip( 78 | '.tsv').strip('/').replace('_', '-') 79 | 80 | elif args.task == 'QQP': 81 | 82 | num_classes = 2 83 | from tasks.glue.qqp import QQPDataset as Dataset 84 | 85 | def name_from_datapath(datapath): 86 | return datapath.split('QQP')[-1].strip( 87 | '.tsv').strip('/').replace('_', '-') 88 | 89 | else: 90 | raise NotImplementedError('GLUE task {} is not implemented.'.format( 91 | args.task)) 92 | 93 | glue_classification(num_classes, Dataset, name_from_datapath) 94 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {}, {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /tasks/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 21 | os.path.pardir))) 22 | 23 | from megatron import get_args 24 | from megatron.initialize import initialize_megatron 25 | 26 | 27 | def get_tasks_args(parser): 28 | """Provide extra arguments required for tasks.""" 29 | group = parser.add_argument_group(title='tasks') 30 | 31 | group.add_argument('--task', type=str, required=True, 32 | help='Task name.') 33 | group.add_argument('--epochs', type=int, default=None, 34 | help='Number of finetuning epochs. 
Zero results in ' 35 | 'evaluation only.') 36 | group.add_argument('--pretrained-checkpoint', type=str, default=None, 37 | help='Pretrained checkpoint used for finetuning.') 38 | group.add_argument('--keep-last', action='store_true', 39 | help='Keep the last batch (maybe incomplete) in ' 40 | 'the data loader') 41 | group.add_argument('--train-data', nargs='+', default=None, 42 | help='Whitespace separated paths or corpora names ' 43 | 'for training.') 44 | group.add_argument('--valid-data', nargs='*', default=None, 45 | help='path(s) to the validation data.') 46 | group.add_argument('--overlapping-eval', type=int, default=32, 47 | help='Sliding window for overlapping evaluation.') 48 | group.add_argument('--strict-lambada', action='store_true', 49 | help='Use more difficult formulation of lambada.') 50 | # Retriever args 51 | group.add_argument('--qa-data-dev', type=str, default=None, 52 | help='Path to the QA dataset dev file.') 53 | group.add_argument('--qa-data-test', type=str, default=None, 54 | help='Path to the QA dataset test file.') 55 | 56 | # Faiss arguments for retriever 57 | group.add_argument('--faiss-use-gpu', action='store_true', 58 | help='Whether to create the FaissMIPSIndex on GPU') 59 | group.add_argument('--faiss-match', type=str, default='string', 60 | choices=['regex', 'string'], help='Answer matching ' 61 | 'logic type') 62 | group.add_argument('--faiss-topk-retrievals', type=int, default=100, 63 | help='Number of blocks to use as top-k during retrieval') 64 | 65 | return parser 66 | 67 | 68 | if __name__ == '__main__': 69 | 70 | initialize_megatron(extra_args_provider=get_tasks_args) 71 | 72 | args = get_args() 73 | 74 | if args.num_layers_per_virtual_pipeline_stage is not None: 75 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 76 | exit() 77 | 78 | if args.task == 'RACE': 79 | from race.finetune import main 80 | elif args.task in ['MNLI', 'QQP']: 81 | from glue.finetune import main 82 | elif args.task in ['LAMBADA', 'WIKITEXT103']: 83 | from zeroshot_gpt.evaluate import main 84 | elif args.task in ['ICT-ZEROSHOT-NQ']: 85 | from orqa.evaluate_orqa import main 86 | else: 87 | raise NotImplementedError('Task {} is not implemented.'.format( 88 | args.task)) 89 | 90 | main() 91 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
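Both `tasks/main.py` above and `tasks/vision/main.py` below hook task-specific flags into Megatron through `initialize_megatron(extra_args_provider=...)`. A minimal sketch of that pattern (the flag itself is invented for illustration):

```python
# Sketch of the extra_args_provider pattern used by the task entry points.
# --my-task-flag is a made-up argument, only here to show the shape.
import argparse


def my_extra_args(parser):
    group = parser.add_argument_group(title='my tasks')
    group.add_argument('--my-task-flag', action='store_true',
                       help='Hypothetical task-specific switch.')
    return parser

# initialize_megatron(extra_args_provider=my_extra_args) then merges these
# options into Megatron's own parser before arguments are parsed.
```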
15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | from megatron import get_args 22 | from tasks.orqa.evaluate_utils import ORQAEvaluator 23 | 24 | def main(): 25 | """ 26 | Main program 27 | """ 28 | 29 | args = get_args() 30 | 31 | # Set up the model and evaluator 32 | evaluator = ORQAEvaluator() 33 | 34 | # Run evaluation 35 | if args.qa_data_dev is not None: 36 | evaluator.evaluate(args.qa_data_dev, "DEV") 37 | 38 | if args.qa_data_test is not None: 39 | evaluator.evaluate(args.qa_data_test, "TEST") 40 | 41 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Provide metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True) 44 | 45 | """Finetune/evaluate.""" 46 | finetune( 47 | train_valid_datasets_provider, 48 | model_provider, 49 | end_of_epoch_callback_provider=accuracy_func_provider, 50 | ) 51 | 52 | 53 | def main(): 54 | classification() 55 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | import torch 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import mpu 23 | from tasks.vision.finetune_utils import build_data_loader 24 | from tasks.vision.finetune_utils import process_batch 25 | from torchvision import datasets, transforms 26 | 27 | 28 | def accuracy_func_provider(): 29 | """Provide function that calculates accuracies.""" 30 | args = get_args() 31 | data_path = args.data_path 32 | crop_size = args.img_dim 33 | 34 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 35 | # Build dataloaders. 
36 | val_data_path = os.path.join(data_path[0], "val") 37 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 38 | transform_val = transforms.Compose( 39 | [ 40 | transforms.Resize(crop_size), 41 | transforms.CenterCrop(crop_size), 42 | transforms.ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 47 | 48 | dataloader = build_data_loader( 49 | dataset, 50 | args.micro_batch_size, 51 | num_workers=args.num_workers, 52 | drop_last=(mpu.get_data_parallel_world_size() > 1), 53 | ) 54 | 55 | def metrics_func(model, epoch): 56 | print_rank_0("calculating metrics ...") 57 | correct, total = calculate_correct_answers(model, dataloader, epoch) 58 | percent = float(correct) * 100.0 / float(total) 59 | print_rank_0( 60 | " >> |epoch: {}| overall: correct / total = {} / {} = " 61 | "{:.4f} %".format(epoch, correct, total, percent) 62 | ) 63 | 64 | return metrics_func 65 | 66 | 67 | def calculate_correct_answers(model, dataloader, epoch): 68 | """Calculate correct over total answers""" 69 | 70 | model.eval() 71 | with torch.no_grad(): 72 | # For all the batches in the dataset. 73 | total = 0 74 | correct = 0 75 | for _, batch in enumerate(dataloader): 76 | # Run the model forward. 77 | images, labels = process_batch(batch) 78 | logits = model(images).contiguous().float() 79 | # Add output predictions. 80 | # Compute the correct answers. 81 | predicted = torch.argmax(logits, dim=-1) 82 | corrects = (predicted == labels).float() 83 | # Add to the counters. 84 | total += labels.size(0) 85 | correct += corrects.sum().item() 86 | model.train() 87 | 88 | # Reduce. 89 | unreduced = torch.cuda.LongTensor([correct, total]) 90 | torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) 91 | 92 | # Print on screen. 93 | correct_ans = unreduced[0].item() 94 | total_count = unreduced[1].item() 95 | return correct_ans, total_count 96 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. 
Zero results in " 43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | This test suite heavily borrows from [HF Transformers](https://github.com/huggingface/transformers/), therefore you can refer to its [testing docs](https://huggingface.co/transformers/testing.html) for in-depth details, in particular for writing new tests: we have access to a lot of helper classes and functions, so you can write tests very quickly without needing to reinvent the wheel. 4 | 5 | The foundation is `pytest`, which allows you to write normal `pytest` tests, but we also use a lot of unit tests in particular via `TestCasePlus` which extends `unittest` and provides additional rich functionality. 6 | 7 | ## Running the tests 8 | 9 | ``` 10 | make test 11 | ``` 12 | or: 13 | 14 | ``` 15 | pytest tests 16 | ``` 17 | 18 | Important: the first time you run this, it can take several minutes to build all the Megatron CUDA kernels, as well as the DeepSpeed kernels if you haven't pre-built the latter. 19 | 20 | For various other options please see the doc mentioned at the very top. 21 | 22 | You will want at least 1 GPU available, and ideally 2, to run the tests. 23 | 24 | ## CI 25 | 26 | The CI setup is documented [here](../.github/workflows/ci.md). 27 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | # tests directory-specific settings - this file is run automatically 16 | # by pytest before any tests are run 17 | 18 | import sys 19 | import warnings 20 | from os.path import abspath, dirname, join 21 | 22 | 23 | # allow having multiple repository checkouts and not needing to remember to rerun 24 | # 'pip install -e .[dev]' when switching between checkouts and running tests. 25 | git_repo_path = abspath(join(dirname(dirname(__file__)))) 26 | sys.path.insert(1, git_repo_path) 27 | 28 | # silence FutureWarning warnings in tests since often we can't act on them until 29 | # they become normal warnings - i.e. the tests still need to test the current functionality 30 | warnings.simplefilter(action="ignore", category=FutureWarning) 31 | 32 | 33 | def pytest_sessionfinish(session, exitstatus): 34 | # If no tests are collected, pytest exits with code 5, which makes the CI fail. 35 | if exitstatus == 5: 36 | session.exitstatus = 0 37 | -------------------------------------------------------------------------------- /tests/data/gpt2/README.md: -------------------------------------------------------------------------------- 1 | Dataset used for testing. 2 | 3 | `ag_news_prompt*`: manually generated from dataset available at https://huggingface.co/datasets/TimeRobber/ag_news_classify_question_first_100 -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_inputs_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_inputs_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_inputs_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_inputs_document.idx -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_targets_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_targets_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_targets_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_targets_document.idx -------------------------------------------------------------------------------- /tests/data/gpt2/generate_ag_news_mtf_dataset.sh: -------------------------------------------------------------------------------- 1 | python -c "from datasets import load_dataset; load_dataset('TimeRobber/ag_news_classify_question_first_100', split='train').to_json('ag_news_classify_question_first_100.jsonl')" 2 | 3 | python tools/preprocess_data.py \ 4 | --input ag_news_classify_question_first_100.jsonl \ 5 | --output-prefix tests/data/gpt2/ag_news_prompt \ 6 | --dataset-impl mmap \ 7 | --json-key targets \ 8 | --tokenizer-type PretrainedFromHF \ 9 | --tokenizer-name-or-path bigscience/tokenizer \ 10 | 
--append-eod \ 11 | --workers 8 12 | 13 | python tools/preprocess_data.py \ 14 | --input ag_news_classify_question_first_100.jsonl \ 15 | --output-prefix tests/data/gpt2/ag_news_prompt \ 16 | --dataset-impl mmap \ 17 | --json-key inputs \ 18 | --tokenizer-type PretrainedFromHF \ 19 | --tokenizer-name-or-path bigscience/tokenizer \ 20 | --workers 8 21 | 22 | rm ag_news_classify_question_first_100.jsonl 23 | -------------------------------------------------------------------------------- /tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx -------------------------------------------------------------------------------- /tests/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "fp16": { 9 | "enabled": true, 10 | "loss_scale": 0, 11 | "loss_scale_window": 500, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1, 14 | "initial_scale_power": 12 15 | }, 16 | "zero_allow_untested_optimizer": true, 17 | "steps_per_print": 2000, 18 | "wall_clock_breakdown": false 19 | } 20 | -------------------------------------------------------------------------------- /tests/ds_config_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | "zero_allow_untested_optimizer": true, 12 | "steps_per_print": 2000, 13 | "wall_clock_breakdown": false 14 | } 15 | -------------------------------------------------------------------------------- /tests/ds_config_cl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "fp16": { 9 | "enabled": true, 10 | "loss_scale": 0, 11 | "loss_scale_window": 500, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1, 14 | "initial_scale_power": 12 15 | }, 16 | "curriculum_learning": { 17 | "enabled": true, 18 | "curriculum_type": "seqlen", 19 | "min_difficulty": 8, 20 | "max_difficulty": 128, 21 | "schedule_type": "fixed_linear", 22 | "schedule_config": { 23 | "total_curriculum_step": 30, 24 | "difficulty_step": 4 25 | } 26 | }, 27 | "steps_per_print": 2000, 28 | "wall_clock_breakdown": false 29 | } 30 | -------------------------------------------------------------------------------- /tests/ds_config_inference.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "fp16": { 5 | "enabled": true, 6 | "loss_scale": 0, 7 | "loss_scale_window": 
500, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1, 10 | "initial_scale_power": 12 11 | }, 12 | "zero_allow_untested_optimizer": false, 13 | "steps_per_print": 2000, 14 | "wall_clock_breakdown": false 15 | } 16 | -------------------------------------------------------------------------------- /tests/test_activations.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | from megatron.model.glu_activations import GLU_ACTIVATIONS, geglu, liglu, reglu, swiglu 8 | from megatron.testing_utils import set_seed, torch_assert_equal 9 | 10 | 11 | class TestActivations(unittest.TestCase): 12 | def setUp(self): 13 | """setup an input of reasonable size""" 14 | set_seed() 15 | self.batch_size = random.randint(2, 64) 16 | self.seq_len = random.randint(256, 1025) 17 | self.num_channels = random.randint(1, 384) * 2 18 | self.x = torch.randn(self.batch_size, self.seq_len, self.num_channels) 19 | self.x1, self.x2 = self.x.chunk(2, dim=-1) 20 | # glu should halve the last dimension 21 | self.output_shape = [self.batch_size, self.seq_len, self.num_channels // 2] 22 | 23 | def test_shapes(self): 24 | for activation_fn in GLU_ACTIVATIONS.values(): 25 | output = activation_fn(self.x) 26 | self.assertEqual(list(output.shape), self.output_shape) 27 | 28 | def test_liglu(self): 29 | expected = self.x1 * self.x2 30 | torch_assert_equal(liglu(self.x), expected) 31 | 32 | def test_geglu(self): 33 | expected = self.x1 * F.gelu(self.x2) 34 | torch_assert_equal(geglu(self.x), expected) 35 | 36 | def test_reglu(self): 37 | expected = self.x1 * F.relu(self.x2) 38 | torch_assert_equal(reglu(self.x), expected) 39 | 40 | def test_swiglu(self): 41 | expected = self.x1 * F.silu(self.x2) 42 | torch_assert_equal(swiglu(self.x), expected) 43 | 44 | # from megatron.testing_utils import require_torch_bf16 45 | # @require_torch_bf16 46 | # def test_bf16_jit(self): 47 | # x_bf16 = self.x.to(torch.bfloat16) 48 | # for activation_fn in GLU_ACTIVATIONS.values(): 49 | # output = activation_fn(x_bf16) 50 | # self.assertEqual(list(output.shape), self.output_shape) 51 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | -------------------------------------------------------------------------------- /tests/tools/README.md: -------------------------------------------------------------------------------- 1 | # Test suite tools 2 | 3 | # Make tiny tokenizer files 4 | 5 | currently for gpt2 run: 6 | ``` 7 | ./shrink-tokenizer.py 8 | ``` 9 | 10 | and then we have tiny vocab and merge files under the generated dir `tiny` to add to repo under `data/gpt2`. 11 | 12 | ``` 13 | cp tiny/merges.txt ../data/gpt2/gpt2-tiny-merges.txt 14 | cp tiny/vocab.json ../data/gpt2/gpt2-tiny-vocab.json 15 | ``` 16 | 17 | Note, the tiny vocab was set to 5000 items after experimenting with the resulting index files size. Using a tiny vocab of 500 (and adjusted merge entries) proved to generate very large index files, so it actually ends up costing more in final file size. 5000 proved to generate an almost identical index files as with the original 50k vocab size. 
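To sanity-check the shrunken tokenizer before committing the files, a quick load test can be run (a sketch, assuming the generated `tiny` dir from the step above):

```
python -c "
from transformers import GPT2Tokenizer
tok = GPT2Tokenizer(vocab_file='tiny/vocab.json', merges_file='tiny/merges.txt')
print(len(tok))  # expect 5000
print(tok.tokenize('hello world'))  # should still split into plausible sub-words
"
```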
18 | 19 | 20 | ## Make tiny pre-processed index 21 | 22 | To be used in test training. 23 | 24 | ``` 25 | ./openwebtext-to-jsonl.py 26 | ``` 27 | 28 | This generates: 29 | 30 | ``` 31 | openwebtext-1000.jsonl 32 | ``` 33 | 34 | We don't want to store the jsonl in the repo, to keep its size small, so it's a temp file. 35 | 36 | Now we pre-process it: 37 | 38 | ``` 39 | cd ../.. 40 | input=tests/tools/openwebtext-1000.jsonl 41 | python tools/preprocess_data.py \ 42 | --input $input \ 43 | --output-prefix tests/data/gpt2/meg-gpt2-openwebtext \ 44 | --dataset-impl mmap \ 45 | --tokenizer-type GPT2BPETokenizer \ 46 | --merge-file tests/data/gpt2/gpt2-tiny-merges.txt \ 47 | --vocab tests/data/gpt2/gpt2-tiny-vocab.json \ 48 | --append-eod \ 49 | --workers 6 50 | ``` 51 | 52 | and voilà, we now have: 53 | ``` 54 | ls -sh1 tests/data/gpt2/meg-gpt2-openwebtext* 55 | 2.6M tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin 56 | 20K tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx 57 | ``` 58 | which we can now commit and use in tests. 59 | -------------------------------------------------------------------------------- /tests/tools/openwebtext-to-jsonl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # generate a jsonl version of a small slice of a dataset that can be fed to megatron-lm preprocessor 4 | 5 | import sys 6 | from datasets import load_dataset 7 | 8 | dataset_name = "stas/openwebtext-10k" 9 | 10 | # subset to jsonlines 11 | n_samples = 1000 12 | ds = load_dataset(dataset_name, split='train') 13 | ds_small = ds.select(range(n_samples)) 14 | path = f"openwebtext-{n_samples}.jsonl" 15 | ds_small.to_json(path, orient="records", lines=True) 16 | -------------------------------------------------------------------------------- /tests/tools/shrink-tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # produce a tiny tokenizer which we can use in testing (so that it won't take much space in the repo) 4 | 5 | import json 6 | from transformers import AutoTokenizer 7 | from tokenizers import Tokenizer 8 | 9 | mname = "gpt2" 10 | 11 | vocab_keep_items = 5000 12 | 13 | tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True) 14 | assert tokenizer.is_fast, "This only works for fast tokenizers."
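# note: the block below serializes the fast tokenizer's backing Rust tokenizer to JSON, shrinks its vocab (and, for BPE, its merges) in place, and reloads it at the end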
15 | tokenizer_json = json.loads(tokenizer._tokenizer.to_str()) 16 | vocab = tokenizer_json["model"]["vocab"] 17 | if tokenizer_json["model"]["type"] == "BPE": 18 | if "gpt2" in mname: 19 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items-1 } 20 | new_vocab["<|endoftext|>"] = vocab_keep_items-1 21 | else: 22 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items } 23 | merges = tokenizer_json["model"]["merges"] 24 | new_merges = [] 25 | for i in range(len(merges)): 26 | a, b = merges[i].split() 27 | new_token = "".join((a, b)) 28 | if a in new_vocab and b in new_vocab and new_token in new_vocab: 29 | new_merges.append(merges[i]) 30 | tokenizer_json["model"]["merges"] = new_merges 31 | elif tokenizer_json["model"]["type"] == "Unigram": 32 | new_vocab = vocab[:vocab_keep_items] 33 | elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel": 34 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items } 35 | else: 36 | raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}") 37 | tokenizer_json["model"]["vocab"] = new_vocab 38 | tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json)) 39 | tokenizer.save_pretrained("tiny") 40 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Tools 3 | 4 | - [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations? For example, to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples. 5 | 6 | 7 | ## A few notes on how we created the datasets: 8 | 9 | ### Creating the Json Lines text file 10 | 11 | First you need to create a jsonl file containing your dataset. For this we exported from the HF-datasets format. For example for C4: 12 | 13 | ``` 14 | from datasets import load_dataset 15 | c4 = load_dataset("c4", "en") 16 | c4["train"].to_json("c4_en_train.jsonl") 17 | c4["validation"].to_json("c4_en_valid.jsonl") 18 | ``` 19 | 20 | This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example). 21 | 22 | ### Megatron pre-processing 23 | 24 | Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping, creating two files, one to store the token indices and one to store the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF-tokenizers. For example, in our GPT-like models reusing a T5 sentencepiece-bpe tokenizer: 25 | 26 | `python tools/preprocess_data.py --input ~/c4_en_train.jsonl --output-prefix c4_en_train --dataset-impl mmap --tokenizer-type PretrainedFromHF --tokenizer-name-or-path t5-small --workers 30 --append-eod` 27 | 28 | Do note that adding too many workers can be counterproductive for very large datasets: as the bottleneck becomes disk writing, the intermediate per-process results pool up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail. A sharded export, as sketched below, is one way around both the giant jsonl and the RAM pressure.
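For such cases, one workaround is to export the jsonl in shards and pre-process each shard separately with a modest `--workers` count. A minimal sketch (the shard count and file naming are arbitrary choices, not project conventions):

```
from datasets import load_dataset

c4 = load_dataset("c4", "en", split="train")
n_shards = 8  # arbitrary; pick based on available RAM and disk throughput
for i in range(n_shards):
    # Dataset.shard() returns the i-th of n_shards slices of the dataset
    shard = c4.shard(num_shards=n_shards, index=i)
    shard.to_json(f"c4_en_train_{i:02d}.jsonl", orient="records", lines=True)
```

Each resulting `c4_en_train_*.jsonl` can then be passed to `tools/preprocess_data.py` as above.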
29 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/deepspeed_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import torch 5 | import json 6 | import sys 7 | from pathlib import Path 8 | 9 | # insert megatron's root dir into sys.path 10 | root_repo_path = str(Path(__file__).resolve().parents[2]) 11 | if root_repo_path not in sys.path: 12 | sys.path.insert(0, root_repo_path) 13 | 14 | from deepspeed.checkpoint import DeepSpeedCheckpoint 15 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments 16 | 17 | # the import was tested to work with this version 18 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider 19 | # copying that version here instead 20 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint 21 | from transformers import GPT2Config 22 | 23 | 24 | def main(): 25 | 26 | # this first part comes mainly from deepspeed_to_megatron.main 27 | args = parse_arguments() 28 | print( 29 | f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}' 30 | ) 31 | 32 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, 33 | args.target_pp) 34 | iteration = ds_checkpoint.get_iteration() 35 | input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, 36 | args.for_release) 37 | 38 | # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main 39 | # Spell out all parameters in case the defaults change. 40 | config = GPT2Config( 41 | vocab_size=50257, 42 | n_positions=1024, 43 | n_ctx=1024, 44 | n_embd=1024, 45 | n_layer=24, 46 | n_head=16, 47 | n_inner=4096, 48 | activation_function="gelu", # used to be "gelu_new" in earlier versions 49 | resid_pdrop=0.1, 50 | embd_pdrop=0.1, 51 | attn_pdrop=0.1, 52 | layer_norm_epsilon=1e-5, 53 | initializer_range=0.02, 54 | summary_type="cls_index", 55 | summary_use_proj=True, 56 | summary_activation=None, 57 | summary_proj_to_labels=True, 58 | summary_first_dropout=0.1, 59 | scale_attn_weights=True, 60 | gradient_checkpointing=False, 61 | use_cache=True, 62 | bos_token_id=50256, 63 | eos_token_id=50256, 64 | ) 65 | 66 | # Convert. 67 | print("Converting to HF Checkpoint") 68 | output_state_dict = convert_megatron_checkpoint(args, input_state_dict, 69 | config) 70 | 71 | basename = args.output_folder 72 | os.makedirs(basename, exist_ok=True) 73 | 74 | # Print the structure of converted state dict. 75 | #if args.print_checkpoint_structure: 76 | # recursive_print(None, output_state_dict) 77 | 78 | # Store the config to file. 79 | output_config_file = os.path.join(basename, "config.json") 80 | output_config = config.to_dict() 81 | output_config["architectures"] = ["GPT2LMHeadModel"] 82 | output_config["model_type"] = "gpt2" 83 | print(f'Saving config to "{output_config_file}"') 84 | with open(output_config_file, "w") as f: 85 | json.dump(output_config, f) 86 | 87 | # Store the state_dict to file. 
88 | output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") 89 | print(f'Saving checkpoint to "{output_checkpoint_file}"') 90 | torch.save(output_state_dict, output_checkpoint_file) 91 | 92 | print("Now add tokenizer files and upload to the hub") 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os 4 | from collections import OrderedDict 5 | from pathlib import Path 6 | 7 | # insert megatron's root dir into sys.path 8 | root_repo_path = str(Path(__file__).resolve().parents[2]) 9 | if root_repo_path not in sys.path: 10 | sys.path.insert(0, root_repo_path) 11 | 12 | 13 | def dump_data(datum, name_list=[]): 14 | if type(datum) in (dict, OrderedDict): 15 | for k, v in datum.items(): 16 | dump_data(v, name_list + [str(k)]) 17 | elif type(datum) in (list, tuple): 18 | for v in datum: 19 | dump_data(v, name_list) 20 | elif torch.is_tensor(datum): 21 | prefix = '.'.join(name_list) 22 | print(f'[tensor] {prefix} = {datum.shape}') 23 | else: 24 | #pass 25 | prefix = '.'.join(name_list) 26 | print(f'[other] {prefix} = {datum}') 27 | 28 | 29 | def main(): 30 | if len(sys.argv) < 2: 31 | print(f'Usage: {sys.argv[0]} <checkpoint file>') 32 | exit(1) 33 | 34 | ckpt_file = sys.argv[1] 35 | if not os.path.isfile(ckpt_file): 36 | print(f'{ckpt_file} is not a valid file') 37 | exit(1) 38 | 39 | print(f'loading checkpoint file: {ckpt_file}') 40 | sd = torch.load(ckpt_file, map_location=torch.device('cpu')) 41 | dump_data(sd) 42 | 43 | quit() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_deepspeed_checkpoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # insert megatron's root dir into sys.path 5 | root_repo_path = str(Path(__file__).resolve().parents[2]) 6 | if root_repo_path not in sys.path: 7 | sys.path.insert(0, root_repo_path) 8 | 9 | import argparse 10 | 11 | from deepspeed.checkpoint import DeepSpeedCheckpoint 12 | 13 | 14 | def list_files(file_list, tag): 15 | print(f'Listing files: {tag}') 16 | for i, file in enumerate(file_list): 17 | print(f'{i+1}: {file}') 18 | 19 | 20 | def parse_arguments(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--folder', 23 | default=None, 24 | type=str, 25 | help='DeepSpeed Checkpoint folder') 26 | parser.add_argument('--target_tp', 27 | default=None, 28 | type=int, 29 | help='Target TP degree') 30 | parser.add_argument('--target_pp', 31 | default=None, 32 | type=int, 33 | help='Target PP degree') 34 | args = parser.parse_args() 35 | print(f'args = {args}') 36 | return args 37 | 38 | 39 | def show_input_files(ds_checkpoint): 40 | list_files(ds_checkpoint.file_list, 'all') 41 | list_files(ds_checkpoint.zero_files, 'zero') 42 | list_files(ds_checkpoint.layer_files, 'layer') 43 | list_files(ds_checkpoint.mp_rank_files, 'mp rank') 44 | 45 | 46 | def show_simple_state(ds_checkpoint): 47 | print(f'layer keys = {ds_checkpoint.layer_keys}') 48 | print(f'layer count = {ds_checkpoint.layer_count}') 49 | 50 | print( 51 | f'tp_degree_count = {ds_checkpoint.original_tp_degree} ------> {ds_checkpoint.tp_degree}' 52 | ) 53 | print( 54 | f'pp_degree_count = {ds_checkpoint.original_pp_degree} 
------> {ds_checkpoint.pp_degree}' 55 | ) 56 | print(f'dp_degree_count = {ds_checkpoint.dp_degree}') 57 | ds_checkpoint.old_2d_map.print_data('old 2d map ==>') 58 | ds_checkpoint.new_2d_map.print_data('new 2d map ==>') 59 | 60 | 61 | def show_mappings(ds_checkpoint): 62 | ds_checkpoint.show_pp_tranformer_map() 63 | ds_checkpoint.show_transformer_file_map() 64 | ds_checkpoint.show_tp_embedding_map() 65 | ds_checkpoint.show_tp_final_norm_map() 66 | ds_checkpoint.show_2d_mapping() 67 | 68 | 69 | def show_state_summary(tag, sd): 70 | summary = {k: v.shape for k, v in sd.items()} 71 | print(f'{tag} = {summary}') 72 | 73 | 74 | def show_embedding_states(ds_checkpoint): 75 | for i in range(0, ds_checkpoint.tp_degree): 76 | sd = ds_checkpoint.get_embedding_state(i) 77 | show_state_summary(f'embedding[{i}]', sd) 78 | 79 | 80 | def show_final_norm_states(ds_checkpoint): 81 | for i in range(0, ds_checkpoint.tp_degree): 82 | sd = ds_checkpoint.get_final_norm_state(i) 83 | show_state_summary(f'final_norm[{i}]', sd) 84 | 85 | 86 | def show_transformer_states(ds_checkpoint): 87 | for i in range(0, ds_checkpoint.tp_degree): 88 | for j in range(0, ds_checkpoint.pp_degree): 89 | state_list = ds_checkpoint.get_transformer_state(tp_index=i, 90 | pp_index=j) 91 | print(f'tp_pp_rank[{i},{j}] = ') 92 | for k, sd in enumerate(state_list): 93 | show_state_summary(f' block[{k}]', sd) 94 | print("") 95 | 96 | 97 | def main(): 98 | print(f'Inspecting DeepSpeed Checkpoint') 99 | args = parse_arguments() 100 | 101 | ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, 102 | args.target_pp) 103 | ds_checkpoint.validate_files() 104 | 105 | show_simple_state(ds_checkpoint) 106 | show_input_files(ds_checkpoint) 107 | show_simple_state(ds_checkpoint) 108 | show_mappings(ds_checkpoint) 109 | show_embedding_states(ds_checkpoint) 110 | show_final_norm_states(ds_checkpoint) 111 | show_transformer_states(ds_checkpoint) 112 | checkpoint_args = ds_checkpoint.get_args() 113 | print(f'checkpoint args = {checkpoint_args}') 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 4 | os.path.pardir))) 5 | 6 | from megatron import print_rank_0 7 | from megatron.indexer import IndexBuilder 8 | from megatron.initialize import initialize_megatron 9 | 10 | 11 | def main(): 12 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 13 | - Include all args needed for initial model specification 14 | 15 | Other key args: 16 | --block-data-path: path to write to 17 | --ict-load or --realm-load: path to checkpoint with which to embed 18 | --data-path and --titles-data-path: paths for dataset 19 | --indexer-log-interval: reporting interval 20 | --indexer-batch-size: size specific for indexer jobs 21 | 22 | Check README.md for example script 23 | """ 24 | 25 | initialize_megatron(extra_args_provider=None, 26 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 27 | index_builder = IndexBuilder() 28 | index_builder.build_and_save_index() 29 | print_rank_0("Build and save indices: done!") 30 | 31 | if __name__ == "__main__": 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /tools/linter.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if fname.endswith(".py")) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith(".py"))) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command.split()) # check_call expects an argument list, not a shell string 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/logs/rescale-logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script fixes up BigScience log files by adjusting and fixing 5 | # units of logged values to be seconds instead of milliseconds. 6 | # It does the modification in-place (so make back ups!). 7 | # 8 | # Example: 9 | # 10 | # find . -name "*.out*" -print0 | xargs -0 -P 8 rescale-logs.py 11 | # 12 | # See also the discussion in 13 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/issues/236. 14 | # 15 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 17 | 18 | import os 19 | import re 20 | import sys 21 | 22 | LINE_START_RE = re.compile(' ?iteration') 23 | ELAPSED_TIME_RE = re.compile(r'elapsed time per iteration \(ms\): ([0-9.]+)') 24 | SAMPLES_PER_SEC_RE = re.compile('samples per second: ([0-9.]+)') 25 | 26 | 27 | def rescale_logs(log_file_path): 28 | new_log_file_path = log_file_path + '.new' 29 | with open(log_file_path, 'r') as log_file: 30 | with open(new_log_file_path, 'w') as new_log_file: 31 | for line in log_file.readlines(): 32 | if LINE_START_RE.match(line): 33 | match = ELAPSED_TIME_RE.search(line) 34 | if match: 35 | # Logged time is in ms, so convert the match. 36 | time_in_sec = float(match[1]) / 1000 37 | replacement = ( 38 | f'elapsed time per iteration (s): ' 39 | f'{time_in_sec:.2f}' 40 | ) 41 | 42 | # We only need to replace once per line. 43 | line = ELAPSED_TIME_RE.sub(replacement, line, count=1) 44 | 45 | match = SAMPLES_PER_SEC_RE.search(line) 46 | if match: 47 | # The rate was computed against ms-scaled time, so multiply the match by 1000. 48 | time_in_sec = float(match[1]) * 1000 49 | # As the values are already logged up to 3 50 | # numbers after the decimal point and we scale 51 | # by exactly that amount, we log them without 52 | # decimal point here in order to not seem more 53 | # exact than we are.
54 | replacement = f'samples per second: {time_in_sec:.0f}' 55 | 56 | # We only need to replace once per line. 57 | line = SAMPLES_PER_SEC_RE.sub( 58 | replacement, 59 | line, 60 | count=1, 61 | ) 62 | 63 | new_log_file.write(line) 64 | 65 | os.rename(new_log_file_path, log_file_path) 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) < 2: 70 | print(f'{sys.argv[0]} <log file>', 71 | file=sys.stderr) 72 | sys.exit(1) 73 | 74 | input_file = sys.argv[1] 75 | rescale_logs(input_file) 76 | print('Done') 77 | -------------------------------------------------------------------------------- /tools/logs/tb-rename-events.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script renames event names in TensorBoard log files. 5 | # It does the renaming in-place (so make back ups!). 6 | # 7 | # Example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rename-events.py {} "iteration-time" "iteration-time/iteration-time" \; 10 | # 11 | # More than one old tag can be remapped to one new tag – use ";" as a separator: 12 | # 13 | # tb-rename-events.py events.out.tfevents.1 "training loss;validation loss" "loss" 14 | # 15 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 17 | 18 | import os 19 | import sys 20 | 21 | # Use this if you want to avoid using the GPU 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 23 | import tensorflow as tf 24 | from tensorflow.core.util.event_pb2 import Event 25 | 26 | 27 | def rename_events(input_file, old_tags, new_tag): 28 | new_file = input_file + '.new' 29 | # Make a record writer 30 | with tf.io.TFRecordWriter(new_file) as writer: 31 | # Iterate event records 32 | for rec in tf.data.TFRecordDataset([input_file]): 33 | # Read event 34 | ev = Event() 35 | ev.MergeFromString(rec.numpy()) 36 | # Check if it is a summary 37 | if ev.summary: 38 | # Iterate summary values 39 | for v in ev.summary.value: 40 | # Check if the tag should be renamed 41 | if v.tag in old_tags: 42 | # Rename with new tag name 43 | v.tag = new_tag 44 | writer.write(ev.SerializeToString()) 45 | os.rename(new_file, input_file) 46 | 47 | 48 | if __name__ == '__main__': 49 | if len(sys.argv) != 4: 50 | print(f'{sys.argv[0]} <input file> <old tags> <new tag>', 51 | file=sys.stderr) 52 | sys.exit(1) 53 | input_file, old_tags, new_tag = sys.argv[1:] 54 | old_tags = old_tags.split(';') 55 | rename_events(input_file, old_tags, new_tag) 56 | print('Done') 57 | -------------------------------------------------------------------------------- /tools/logs/tb-rescale-scalars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script rescales scalar values in TensorBoard log files. 5 | # It does the modification in-place (so make back ups!). 6 | # 7 | # Example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rescale-scalars.py {} "iteration-time/samples per second" 1000 \; 10 | # 11 | # More than one tag can be rescaled – use ";" as a separator: 12 | # 13 | # tb-rescale-scalars.py events.out.tfevents.1 "training loss;validation loss" 1e-2 14 | # 15 | # By default, BigScience GPT throughput values will be fixed up according to 16 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/issues/236, 17 | # i.e. the rescaling fixes values wrongly logged as "seconds" when they are 18 | # actually milliseconds.
19 | # 20 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 21 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 22 | 23 | import os 24 | import sys 25 | 26 | # Use this if you want to avoid using the GPU 27 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 28 | import tensorflow as tf 29 | from tensorflow.core.util.event_pb2 import Event 30 | 31 | 32 | def rescale_scalars(input_file, tags, rescale_factor): 33 | new_file = input_file + '.new' 34 | # Make a record writer 35 | with tf.io.TFRecordWriter(new_file) as writer: 36 | # Iterate event records 37 | for rec in tf.data.TFRecordDataset([input_file]): 38 | # Read event 39 | ev = Event() 40 | ev.MergeFromString(rec.numpy()) 41 | # Check if it is a summary 42 | if ev.summary: 43 | # Iterate summary values 44 | for v in ev.summary.value: 45 | # Check if the tag should be rescaled 46 | if v.tag in tags: 47 | v.simple_value *= rescale_factor 48 | writer.write(ev.SerializeToString()) 49 | os.rename(new_file, input_file) 50 | 51 | 52 | if __name__ == '__main__': 53 | if len(sys.argv) < 2: 54 | print(f'{sys.argv[0]} <input file> [<tags> [<rescale factor>]]', 55 | file=sys.stderr) 56 | sys.exit(1) 57 | 58 | if len(sys.argv) < 3: 59 | sys.argv.append(';'.join([ 60 | 'iteration-time/samples per second', 61 | 'iteration-time/samples per second per replica', 62 | 'iteration-time/tokens per second', 63 | 'iteration-time/tokens per second per replica', 64 | ])) 65 | if len(sys.argv) < 4: 66 | sys.argv.append('1000') 67 | 68 | input_file, tags, rescale_factor = sys.argv[1:] 69 | tags = tags.split(';') 70 | rescale_factor = float(rescale_factor) 71 | rescale_scalars(input_file, tags, rescale_factor) 72 | print('Done') 73 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare the training dataset used to train the model. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py <path to the downloaded deduplicated URLs> <filename for clean urls, e.g. clean_urls.txt> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with one json per line of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data filename> 28 | ``` 29 | Additional cleanup (e.g. remove documents less than 512 characters or dataset specific cleaning like stories, realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing.
More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <pairs of input data files and keys> --output <output file> 33 | ``` 34 | 3. Based on the similarity measure defined inside the function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, we should keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <input file> <output file> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <group urls file> <data file> <output file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <cleaned deduped data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <tasks to filter, e.g. lambada, squad> --dedup-dataset <training dataset to deduplicate> --output <output deduped dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, or any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path, `--lambada-path <path of the lambada test data>`. 58 | 59 | Several other features (e.g. save and load dictionary) have been added; look at `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. The user can add a prefix 23 | to the ids.
24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the file. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /tools/tb/tb-remove-events-by-group.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script removes events from tensorboard log files by group names 5 | # it does the removal in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-remove-events-by-group.py {} "batch-size" \; 10 | # 11 | # which would match any of "batch-size/batch-size", "batch-size/batch-size vs samples", etc. 12 | # 13 | # more than one group can be removed - use `;` as a separator: 14 | # 15 | # tb-remove-events-by-group.py events.out.tfevents.1 "batch-size;grad-norm" 16 | # 17 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 18 | # 19 | # Important: this script requires CUDA environment.
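# (presumably because tensorflow must be importable; the script itself masks all GPUs via CUDA_VISIBLE_DEVICES below, so no GPU is actually used)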
20 | 21 | from pathlib import Path 22 | import os 23 | import re 24 | import shlex 25 | import sys 26 | 27 | # avoid using the GPU 28 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 29 | # disable logging 30 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 31 | import tensorflow as tf 32 | from tensorflow.core.util.event_pb2 import Event 33 | 34 | 35 | def is_tag_matching_group(tag, groups_to_remove): 36 | for group in groups_to_remove: 37 | if tag.startswith(group): 38 | return True 39 | return False 40 | 41 | 42 | def remove_events(input_file, groups_to_remove): 43 | new_file = input_file + ".new" 44 | # Make a record writer 45 | with tf.io.TFRecordWriter(new_file) as writer: 46 | # Iterate event records 47 | for rec in tf.data.TFRecordDataset([input_file]): 48 | # Read event 49 | ev = Event() 50 | ev.MergeFromString(rec.numpy()) 51 | # Check if it is a summary event 52 | if ev.summary: 53 | orig_values = [v for v in ev.summary.value] 54 | filtered_values = [v for v in orig_values if not is_tag_matching_group(v.tag, groups_to_remove)] 55 | #print(f"filtered_values={len(filtered_values)}, orig_values={len(orig_values)}") 56 | if len(filtered_values) != len(orig_values): 57 | # for v in orig_values: 58 | # print(v) 59 | del ev.summary.value[:] 60 | ev.summary.value.extend(filtered_values) 61 | writer.write(ev.SerializeToString()) 62 | os.rename(new_file, input_file) 63 | 64 | def remove_events_dir(input_file, groups_to_remove): 65 | # Write removed events 66 | remove_events(input_file, groups_to_remove) 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) != 3: 70 | print(f'{sys.argv[0]} <input file> <groups to remove>', 71 | file=sys.stderr) 72 | sys.exit(1) 73 | input_file, groups_to_remove = sys.argv[1:] 74 | print(input_file, shlex.quote(groups_to_remove)) 75 | groups_to_remove = groups_to_remove.split(';') 76 | remove_events_dir(input_file, groups_to_remove) 77 | -------------------------------------------------------------------------------- /tools/tb/tb-remove-events-by-tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script removes events from tensorboard log files by specific tag names 5 | # it does the removal in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-remove-events-by-tag.py {} "batch-size/batch-size" \; 10 | # 11 | # more than one tag can be removed - use `;` as a separator: 12 | # 13 | # tb-remove-events-by-tag.py events.out.tfevents.1 "batch-size/batch-size;batch-size/batch-size vs samples" 14 | # 15 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # 17 | # Important: this script requires CUDA environment.
18 | 19 | import shlex 20 | import sys 21 | from pathlib import Path 22 | import os 23 | # avoid using the GPU 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 25 | # disable logging 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 27 | import tensorflow as tf 28 | from tensorflow.core.util.event_pb2 import Event 29 | 30 | def remove_events(input_file, tags_to_remove): 31 | new_file = input_file + ".new" 32 | # Make a record writer 33 | with tf.io.TFRecordWriter(new_file) as writer: 34 | # Iterate event records 35 | for rec in tf.data.TFRecordDataset([input_file]): 36 | # Read event 37 | ev = Event() 38 | ev.MergeFromString(rec.numpy()) 39 | # Check if it is a summary event 40 | if ev.summary: 41 | orig_values = [v for v in ev.summary.value] 42 | filtered_values = [v for v in orig_values if v.tag not in tags_to_remove] 43 | #print(f"filtered_values={len(filtered_values)}, orig_values={len(orig_values)}") 44 | if len(filtered_values) != len(orig_values): 45 | # for v in orig_values: 46 | # print(v) 47 | del ev.summary.value[:] 48 | ev.summary.value.extend(filtered_values) 49 | writer.write(ev.SerializeToString()) 50 | os.rename(new_file, input_file) 51 | 52 | def remove_events_dir(input_file, tags_to_remove): 53 | # Write removed events 54 | remove_events(input_file, tags_to_remove) 55 | 56 | if __name__ == '__main__': 57 | if len(sys.argv) != 3: 58 | print(f'{sys.argv[0]} <input file> <tags to remove>', 59 | file=sys.stderr) 60 | sys.exit(1) 61 | input_file, tags_to_remove = sys.argv[1:] 62 | print(input_file, shlex.quote(tags_to_remove)) 63 | tags_to_remove = tags_to_remove.split(';') 64 | remove_events_dir(input_file, tags_to_remove) 65 | -------------------------------------------------------------------------------- /tools/tb/tb-rename-events.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script renames event names in tensorboard log files 5 | # it does the rename in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rename-events.py {} "iteration-time" "iteration-time/iteration-time" \; 10 | # 11 | # more than one old tag can be remapped to one new tag - use `;` as a separator: 12 | # 13 | # tb-rename-events.py events.out.tfevents.1 "training loss;validation loss" "loss" 14 | # 15 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # 17 | # Important: this script requires CUDA environment.
18 | 19 | import shlex 20 | import sys 21 | from pathlib import Path 22 | import os 23 | # avoid using the GPU 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 25 | # disable logging 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 27 | import tensorflow as tf 28 | from tensorflow.core.util.event_pb2 import Event 29 | 30 | def rename_events(input_file, old_tags, new_tag): 31 | new_file = input_file + ".new" 32 | # Make a record writer 33 | with tf.io.TFRecordWriter(new_file) as writer: 34 | # Iterate event records 35 | for rec in tf.data.TFRecordDataset([input_file]): 36 | # Read event 37 | ev = Event() 38 | ev.MergeFromString(rec.numpy()) 39 | # Check if it is a summary 40 | #print(ev) 41 | if ev.summary: 42 | # Iterate summary values 43 | for v in ev.summary.value: 44 | #print(v) 45 | # Check if the tag should be renamed 46 | if v.tag in old_tags: 47 | # Rename with new tag name 48 | v.tag = new_tag 49 | writer.write(ev.SerializeToString()) 50 | os.rename(new_file, input_file) 51 | 52 | def rename_events_dir(input_file, old_tags, new_tag): 53 | # Write renamed events 54 | rename_events(input_file, old_tags, new_tag) 55 | 56 | if __name__ == '__main__': 57 | if len(sys.argv) != 4: 58 | print(f'{sys.argv[0]} <input file> <old tags> <new tag>', 59 | file=sys.stderr) 60 | sys.exit(1) 61 | input_file, old_tags, new_tag = sys.argv[1:] 62 | print(input_file, shlex.quote(old_tags), shlex.quote(new_tag)) 63 | old_tags = old_tags.split(';') 64 | rename_events_dir(input_file, old_tags, new_tag) 65 | --------------------------------------------------------------------------------