├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── README_ORIGINAL.md ├── examples ├── pretrain_gpt3_5B_24layers_bf16.sh ├── pretrain_gpt3_5B_24layers_bf16_bs1024_slurm.sh ├── pretrain_gpt3_6.7B.slurm ├── pretrain_gpt3_6.7B_32layers_bf16.sh ├── pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh ├── pretrain_gpt3_6.7B_compile.slurm └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── schedules.py ├── text_generation_server.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_mp.py ├── pretrain_ict.py ├── pretrain_t5.py ├── pretrain_vit.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── 
datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests └── test_basic.py └── tools ├── linter.py ├── merge_mp_partitions.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── run_text_generation_server.py └── text_generation_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file defines code owners for the repository. It allows setting code reviewers for all pull requests that merge to the master branch 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # Reference guide - https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-code-owners#example-[…]ners-file 5 | # Example - These owners will be the default owners for everything in 6 | # the repo. Unless a later match takes precedence, 7 | # @global-owner1 and @global-owner2 will be requested for 8 | # review when someone opens a pull request. 9 | # * @global-owner1 @global-owner2 10 | 11 | * @aws-maens @aws-mesharma @musunita 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can.
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository is no longer supported. 2 | 3 | Please start using AWS Neuron reference for NeMo Megatron (https://github.com/aws-neuron/neuronx-nemo-megatron) or Neuron Distributed (https://github.com/aws-neuron/neuronx-distributed) instead. 4 | 5 | This Megatron-LM library (source https://github.com/NVIDIA/Megatron-LM) is adapted for use with AWS Trainium via the AWS Neuron SDK.
6 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_5B_24layers_bf16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_5B_24layers_bf16 5 | 6 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 7 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME} 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 13 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 14 | 15 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 16 | export NEURON_FUSE_SOFTMAX=1 17 | export XLA_USE_BF16=1 18 | 19 | # Workaround "Too many open files" error with GPT training on U20 server AMI 20 | ulimit -n 8192 21 | 22 | TRAIN_ITERS=10000 23 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 24 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 25 | TRAIN_ITERS=65 26 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 27 | fi 28 | 29 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 30 | --tensor-model-parallel-size 8 \ 31 | --num-layers 24 \ 32 | --hidden-size 4096 \ 33 | --num-attention-heads 32 \ 34 | --micro-batch-size 1 \ 35 | --global-batch-size 64 \ 36 | --seq-length 2048 \ 37 | --max-position-embeddings 2048 \ 38 | --train-iters $TRAIN_ITERS \ 39 | --lr-decay-iters 320000 \ 40 | --data-path $DATA_PATH \ 41 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 42 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 43 | --data-impl mmap \ 44 | --split 100,0,0 \ 45 | --distributed-backend xla \ 46 | --lr 0.00015 \ 47 | --lr-decay-style cosine \ 48 | --min-lr 1.0e-5 \ 49 | --weight-decay 1e-2 \ 50 | --clip-grad 1 \ 51 | --lr-warmup-fraction .01 \ 52 | --log-interval 1 \ 53 | --tensorboard-log-interval 1 \ 54 | --eval-interval $TRAIN_ITERS \ 55 | --eval-iters 1000 \ 56 | --attention-dropout 0 \ 57 | --hidden-dropout 0 \ 58 | --no-masked-softmax-fusion \ 59 | --no-bias-gelu-fusion \ 60 | --no-bias-dropout-fusion \ 61 | --no-async-tensor-model-parallel-allreduce \ 62 | --no-contiguous-buffers-in-local-ddp \ 63 | --save-xser $CHECKPOINT_PATH \ 64 | --save-interval 2000 \ 65 | --keep-last-checkpoint-only \ 66 | --use-cpu-initialization \ 67 | --tensorboard-dir $TB_DIR \ 68 | |& tee run_log_$MODEL_CONFIG_NAME.txt & 69 | wait %1 70 | 71 | ret_val=$? 72 | if [ $ret_val -eq 0 ]; then 73 | success=1 74 | else 75 | success=0 76 | fi 77 | 78 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 79 | if [ -e $dump_to_s3_update_json_scr ]; then 80 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 81 | else 82 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 83 | fi 84 | 85 | exit $ret_val 86 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_5B_24layers_bf16_bs1024_slurm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_5B_24layers_bf16 5 | 6 | # Enable Elastic Fabric Adapter for higher networking performance 7 | export FI_EFA_USE_DEVICE_RDMA=1 8 | export FI_PROVIDER=efa 9 | export FI_EFA_FORK_SAFE=1 10 | 11 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 12 | 13 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 14 | MASTER_PORT=2022 15 | NUM_NEURONCORES=32 16 | 17 | WORLD_SIZE_JOB=$SLURM_NTASKS 18 | RANK_NODE=$SLURM_NODEID 19 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 20 | echo $DISTRIBUTED_ARGS 21 | 22 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME}_${WORLD_SIZE_JOB} 23 | 24 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 25 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 26 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 27 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 28 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 29 | export NEURON_FUSE_SOFTMAX=1 30 | # Cast training to BF16 and enable stochastic rounding 31 | export XLA_USE_BF16=1 32 | # Increase Neuron RT execution timeout in case slow compilation causes Neuron RT to wait longer than the default timeout 33 | export NEURON_RT_EXEC_TIMEOUT=600 34 | 35 | # Use a separate NeuronCache dir per node, to work around file-locking limitations on NFS 36 | export NEURON_CC_FLAGS="--cache_dir=$HOME/neuron_cache/gpt/`hostname`" 37 | 38 | TRAIN_ITERS=143051 39 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 40 | # Run fewer steps and ignore TensorBoard output when only extracting graphs (neuron_parallel_compile) 41 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 42 | TRAIN_ITERS=65 43 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 44 | fi 45 | 46 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 47 | --tensor-model-parallel-size 8 \ 48 | --num-layers 24 \ 49 | --hidden-size 4096 \ 50 | --num-attention-heads 32 \ 51 | --micro-batch-size 1 \ 52 | --global-batch-size 1024 \ 53 | --seq-length 2048 \ 54 | --max-position-embeddings 2048 \ 55 | --train-iters $TRAIN_ITERS \ 56 | --lr-decay-iters 123977 \ 57 | --data-path $DATA_PATH \ 58 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 59 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 60 | --data-impl mmap \ 61 | --split 100,0,0 \ 62 | --distributed-backend xla \ 63 | --lr 0.00012 \ 64 | --lr-decay-style cosine \ 65 | --min-lr 1.2e-5 \ 66 | --weight-decay 1e-1 \ 67 | --clip-grad 1 \ 68 | --lr-warmup-fraction 0.00125 \ 69 | --log-interval 1 \ 70 | --tensorboard-log-interval 1 \ 71 | --eval-interval $TRAIN_ITERS \ 72 | --eval-iters 1000 \ 73 | --attention-dropout 0 \ 74 | --hidden-dropout 0 \ 75 | --no-masked-softmax-fusion \ 76 | --no-bias-gelu-fusion \ 77 | --no-bias-dropout-fusion \ 78 | --no-async-tensor-model-parallel-allreduce \ 79 | --no-contiguous-buffers-in-local-ddp \ 80 | --init-method-std 0.006 \ 81 | --adam-beta1 0.9 \ 82 | --adam-beta2 0.95 \ 83 | --save-xser $CHECKPOINT_PATH \ 84 | --save-interval 2000 \ 85 | --keep-last-checkpoint-only \ 86 | --use-cpu-initialization \ 87 | --tensorboard-dir $TB_DIR \ 88 | |& tee run_log_$MODEL_CONFIG_NAME.$RANK_NODE.$WORLD_SIZE_JOB.txt & 89 | wait %1 90 | 91 | ret_val=$?
92 | 93 | if [ $ret_val -eq 0 ] ; then 94 | msg="SUCCESS" 95 | elif [ $ret_val -eq 2 ] ; then 96 | msg="SCANCEL/INTERRUPT" 97 | else 98 | msg="INTERNAL FAILURE" 99 | # Uncomment lines below to requeue after internal failure (make sure the script doesn't fail) 100 | #msg="INTERNAL FAILURE - HARDWARE ISSUE? Requeue JOB ID ${SLURM_JOB_ID} - use scancel to terminate" 101 | #scontrol requeue ${SLURM_JOB_ID} 102 | fi 103 | echo $msg 104 | 105 | if [ $ret_val -eq 0 ]; then 106 | success=1 107 | else 108 | success=0 109 | fi 110 | 111 | # Below is for testing only, not needed for actual execution 112 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 113 | if [ -e $dump_to_s3_update_json_scr ]; then 114 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 115 | else 116 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 117 | fi 118 | 119 | exit $ret_val 120 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=16 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | #SBATCH --requeue 6 | #SBATCH --open-mode=append 7 | 8 | srun ./examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh 9 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_32layers_bf16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_6.7B_32layers_bf16 5 | 6 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 7 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME} 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 13 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 14 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 15 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 16 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 17 | export NEURON_FUSE_SOFTMAX=1 18 | # Cast training to BF16 and enable stochastic rounding 19 | export XLA_USE_BF16=1 20 | 21 | # Workaround "Too many open files" error with GPT training on U20 server AMI 22 | ulimit -n 8192 23 | 24 | TRAIN_ITERS=10000 25 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 26 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 27 | TRAIN_ITERS=65 28 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 29 | fi 30 | 31 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 32 | --tensor-model-parallel-size 8 \ 33 | --num-layers 32 \ 34 | --hidden-size 4096 \ 35 | --num-attention-heads 32 \ 36 | --micro-batch-size 1 \ 37 | --global-batch-size 64 \ 38 | --seq-length 2048 \ 39 | --max-position-embeddings 2048 \ 40 | --train-iters $TRAIN_ITERS \ 41 | --lr-decay-iters 320000 \ 42 | --data-path $DATA_PATH \ 43 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 44 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 45 | --data-impl mmap \ 46 | --split 100,0,0 \ 47 | --distributed-backend xla \ 48 | --lr 0.00015 \ 49 | --lr-decay-style cosine \ 50 | --min-lr 1.0e-5 \ 51 | --weight-decay 1e-2 \ 52 | --clip-grad 1 \ 53 | --lr-warmup-fraction .01 \ 54 | --log-interval 1 \ 55
| --tensorboard-log-interval 1 \ 56 | --eval-interval $TRAIN_ITERS \ 57 | --eval-iters 1000 \ 58 | --attention-dropout 0 \ 59 | --hidden-dropout 0 \ 60 | --no-masked-softmax-fusion \ 61 | --no-bias-gelu-fusion \ 62 | --no-bias-dropout-fusion \ 63 | --no-async-tensor-model-parallel-allreduce \ 64 | --no-contiguous-buffers-in-local-ddp \ 65 | --save-xser $CHECKPOINT_PATH \ 66 | --save-interval 2000 \ 67 | --keep-last-checkpoint-only \ 68 | --use-cpu-initialization \ 69 | --tensorboard-dir $TB_DIR \ 70 | |& tee run_log_$MODEL_CONFIG_NAME.txt & 71 | wait %1 72 | 73 | ret_val=$? 74 | if [ $ret_val -eq 0 ]; then 75 | success=1 76 | else 77 | success=0 78 | fi 79 | 80 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 81 | if [ -e $dump_to_s3_update_json_scr ]; then 82 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 83 | else 84 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 85 | fi 86 | 87 | exit $ret_val 88 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_6.7B_32layers_bf16 5 | 6 | # Enable Elastic Fabric Adapter for higher networking performance 7 | export FI_EFA_USE_DEVICE_RDMA=1 8 | export FI_PROVIDER=efa 9 | export FI_EFA_FORK_SAFE=1 10 | 11 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 12 | 13 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 14 | MASTER_PORT=2022 15 | NUM_NEURONCORES=32 16 | 17 | WORLD_SIZE_JOB=$SLURM_NTASKS 18 | RANK_NODE=$SLURM_NODEID 19 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 20 | echo $DISTRIBUTED_ARGS 21 | 22 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME}_${WORLD_SIZE_JOB} 23 | 24 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 25 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 26 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 27 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 28 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 29 | export NEURON_FUSE_SOFTMAX=1 30 | # Cast training to BF16 and enable stochastic rounding 31 | export XLA_USE_BF16=1 32 | # Increase Neuron RT execution timeout in case slow compilation causes Neuron RT to wait longer than the default timeout 33 | export NEURON_RT_EXEC_TIMEOUT=600 34 | 35 | # Use a separate NeuronCache dir per node, to work around file-locking limitations on NFS 36 | export NEURON_CC_FLAGS="--cache_dir=$HOME/neuron_cache/gpt/`hostname`" 37 | 38 | TRAIN_ITERS=143051 39 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 40 | # Run fewer steps and ignore TensorBoard output when only extracting graphs (neuron_parallel_compile) 41 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 42 | # Use a larger trial count to work around extra recompilation due to https://github.com/pytorch/xla/issues/4994 43 | TRAIN_ITERS=325 44 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 45 | fi 46 | 47 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 48 | --tensor-model-parallel-size 8 \ 49 | --num-layers 32 \ 50 | --hidden-size 4096 \ 51 | --num-attention-heads 32 \ 52 | --micro-batch-size 1 \ 53 |
--global-batch-size 1024 \ 54 | --seq-length 2048 \ 55 | --max-position-embeddings 2048 \ 56 | --train-iters $TRAIN_ITERS \ 57 | --lr-decay-iters 123977 \ 58 | --data-path $DATA_PATH \ 59 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 60 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 61 | --data-impl mmap \ 62 | --split 100,0,0 \ 63 | --distributed-backend xla \ 64 | --lr 0.00012 \ 65 | --lr-decay-style cosine \ 66 | --min-lr 1.2e-5 \ 67 | --weight-decay 1e-1 \ 68 | --clip-grad 1 \ 69 | --lr-warmup-fraction 0.00125 \ 70 | --log-interval 1 \ 71 | --tensorboard-log-interval 1 \ 72 | --eval-interval $TRAIN_ITERS \ 73 | --eval-iters 1000 \ 74 | --attention-dropout 0 \ 75 | --hidden-dropout 0 \ 76 | --no-masked-softmax-fusion \ 77 | --no-bias-gelu-fusion \ 78 | --no-bias-dropout-fusion \ 79 | --no-async-tensor-model-parallel-allreduce \ 80 | --no-contiguous-buffers-in-local-ddp \ 81 | --init-method-std 0.006 \ 82 | --adam-beta1 0.9 \ 83 | --adam-beta2 0.95 \ 84 | --save-xser $CHECKPOINT_PATH \ 85 | --save-interval 2000 \ 86 | --keep-last-checkpoint-only \ 87 | --use-cpu-initialization \ 88 | --tensorboard-dir $TB_DIR \ 89 | |& tee run_log_$MODEL_CONFIG_NAME.$RANK_NODE.$WORLD_SIZE_JOB.txt & 90 | wait %1 91 | 92 | ret_val=$? 93 | 94 | if [ $ret_val -eq 0 ] ; then 95 | msg="SUCCESS" 96 | elif [ $ret_val -eq 2 ] ; then 97 | msg="SCANCEL/INTERRUPT" 98 | else 99 | msg="INTERNAL FAILURE" 100 | # Uncomment lines below to requeue after internal failure (make sure the script doesn't fail) 101 | #msg="INTERNAL FAILURE - HARDWARE ISSUE? Requeue JOB ID ${SLURM_JOB_ID} - use scancel to terminate" 102 | #scontrol requeue ${SLURM_JOB_ID} 103 | fi 104 | echo $msg 105 | 106 | if [ $ret_val -eq 0 ]; then 107 | success=1 108 | else 109 | success=0 110 | fi 111 | 112 | # Below is for testing only, not needed for actual execution 113 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 114 | if [ -e $dump_to_s3_update_json_scr ]; then 115 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 116 | else 117 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 118 | fi 119 | 120 | exit $ret_val 121 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_compile.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=16 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | #SBATCH --requeue 6 | #SBATCH --open-mode=append 7 | 8 | srun neuron_parallel_compile ./examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh 9 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 
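# Note: TP, PP, MBS, GBS, NLS, HS, NAH, and DDP below are set by the calling run_*.sh script before this file is sourced.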
27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Setup 13 | 14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 15 | update the unspecified values (in angle brackets `<...>`) before launching any 16 | scripts. 17 | 18 | 19 | 20 | ## Scripts 21 | 22 | Below is a list of scripts that can be used to reproduce various figures in our 23 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 24 | 25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 26 | for GPT models ranging from 1 billion to 1 trillion parameters. 27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 28 | performance of pipeline parallelism. 29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 30 | the interleaved schedule on a 175B GPT model. 31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 32 | different degrees of pipeline and tensor model parallelism on a model with 33 | 162.2 billion parameters. 34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 35 | different degrees of data and pipeline model parallelism on a model with 36 | 5.9 billion parameters. 37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 38 | different degrees of data and tensor model parallelism on a model with 39 | 5.9 billion parameters. 40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 41 | microbatch size. 42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 43 | activation recomputation. 44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 45 | the scatter-gather communication optimization. 
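As a quick orientation, a filled-in `CONFIG.sh` header might look like the sketch below. Every value shown is a hypothetical placeholder for one particular cluster setup, not a recommendation — substitute your own partition, account, and paths:

```bash
# Hypothetical example values for the unspecified CONFIG.sh variables.
export SLURM_PARTITION=batch                  # your cluster's Slurm partition
export SLURM_ACCOUNT=my-account               # your Slurm account
export MEGATRON_CODE_DIR=/shared/megatron-lm  # checkout of this repository
export DOCKER_MOUNT_DIR=/shared/data          # mounted into the container; holds data and tokenizer files
MEGATRON_DATA=${DOCKER_MOUNT_DIR}/my-gpt2_text_document
BPE_VOCAB_FILE=${DOCKER_MOUNT_DIR}/gpt2-vocab.json
BPE_MERGE_FILE=${DOCKER_MOUNT_DIR}/gpt2-merges.txt
```

Once these values are filled in, each `run_*.sh` script is self-contained: it sets the model and parallelism variables (TP, PP, GBS, and so on), sources `CONFIG.sh`, and submits the job through `SBATCH.sh` and `SRUN.sh`.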
46 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_table_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | # model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T] 7 | MODEL_SIZE=1.7B 8 | 9 | 10 | 11 | 12 | 13 | 14 | if [ ${MODEL_SIZE} == "1.7B" ]; then 15 | TP=1 16 | PP=1 17 | MBS=16 18 | GBS=512 19 | NLS=24 20 | HS=2304 21 | NAH=24 22 | DDP=torch 23 | NNODES=4 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | elif [ ${MODEL_SIZE} == "3.6B" ]; then 26 | TP=2 27 | PP=1 28 | MBS=16 29 | GBS=512 30 | NLS=30 31 | HS=3072 32 | NAH=32 33 | DDP=torch 34 | NNODES=8 35 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 36 | elif [ ${MODEL_SIZE} == "7.5B" ]; then 37 | TP=4 38 | PP=1 39 | MBS=16 40 | GBS=512 41 | NLS=36 42 | HS=4096 43 | NAH=32 44 | DDP=torch 45 | NNODES=16 46 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 47 | elif [ ${MODEL_SIZE} == "18B" ]; then 48 | TP=8 49 | PP=1 50 | MBS=8 51 | GBS=1024 52 | NLS=40 53 | HS=6144 54 | NAH=48 55 | DDP=torch 56 | NNODES=32 57 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 58 | elif [ ${MODEL_SIZE} == "39B" ]; then 59 | TP=8 60 | PP=2 61 | MBS=4 62 | GBS=1536 63 | NLS=48 64 | HS=8192 65 | NAH=64 66 | DDP=local 67 | NNODES=64 68 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 69 | elif [ ${MODEL_SIZE} == "76B" ]; then 70 | TP=8 71 | PP=4 72 | MBS=2 73 | GBS=1792 74 | NLS=60 75 | HS=10240 76 | NAH=80 77 | DDP=local 78 | NNODES=128 79 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5" 80 | elif [ ${MODEL_SIZE} == "145B" ]; then 81 | TP=8 82 | PP=8 83 | MBS=2 84 | GBS=2304 85 | NLS=80 86 | HS=12288 87 | NAH=96 88 | DDP=local 89 | NNODES=192 90 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 " 91 | elif [ ${MODEL_SIZE} == "310B" ]; then 92 | TP=8 93 | PP=16 94 | MBS=1 95 | GBS=2160 96 | NLS=96 97 | HS=16384 98 | NAH=128 99 | DDP=local 100 | NNODES=240 101 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 " 102 | elif [ ${MODEL_SIZE} == "530B" ]; then 103 | TP=8 104 | PP=35 105 | MBS=1 106 | GBS=2520 107 | NLS=105 108 | HS=20480 109 | NAH=128 110 | DDP=local 111 | NNODES=315 112 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 " 113 | elif [ ${MODEL_SIZE} == "1T" ]; then 114 | TP=8 115 | PP=64 116 | MBS=1 117 | GBS=3072 118 | NLS=128 119 | HS=25600 120 | NAH=160 121 | DDP=local 122 | NNODES=384 123 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 124 | else 125 | echo "Invalid configuration" 126 | exit 1 127 | fi 128 | 129 | 130 | # Name of the job 131 | export JOB_NAME=results_table_1_model_size_${MODEL_SIZE} 132 | 133 | 134 | # Import the configs. 135 | . `pwd`/CONFIG.sh 136 | 137 | 138 | # Submit the job. 139 | . 
`pwd`/SBATCH.sh 140 | 141 | 142 | exit 0 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .global_vars import get_args 18 | from .global_vars import get_current_global_batch_size 19 | from .global_vars import get_num_microbatches 20 | from .global_vars import update_num_microbatches 21 | from .global_vars import get_tokenizer 22 | from .global_vars import get_tensorboard_writer 23 | from .global_vars import get_adlr_autoresume 24 | from .global_vars import get_timers 25 | from .initialize import initialize_megatron 26 | from .mpu.initialize import get_tensor_model_parallel_rank, get_data_parallel_rank 27 | 28 | def print_rank_0(message): 29 | """If distributed is initialized, print only on rank 0.""" 30 | if torch.distributed.is_initialized(): 31 | if torch.distributed.get_rank() == 0: 32 | print(message, flush=True) 33 | else: 34 | print(message, flush=True) 35 | 36 | def is_last_rank(): 37 | return torch.distributed.get_rank() == ( 38 | torch.distributed.get_world_size() - 1) 39 | 40 | def print_rank_last(message): 41 | """If distributed is initialized, print only on last rank.""" 42 | if torch.distributed.is_initialized(): 43 | if is_last_rank(): 44 | print(message, flush=True) 45 | 46 | def print_rank_2D(message): 47 | """If distributed is initialized, print on every rank, prefixed with its tensor- and data-parallel ranks.""" 48 | if torch.distributed.is_initialized(): 49 | tp_rank = get_tensor_model_parallel_rank() 50 | dp_rank = get_data_parallel_rank() 51 | print(f'tp:{tp_rank},dp:{dp_rank}:'+message, flush=True) 52 | else: 53 | print(message, flush=True) 54 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | PYTHON_PATH = $(shell readlink -f $(shell which python3)) 5 | ifneq ("$(wildcard $(PYTHON_PATH)m-config)","") 6 | LIBEXT = $(shell $(PYTHON_PATH)m-config --extension-suffix) 7 | else ifneq ("$(wildcard $(PYTHON_PATH)-config)","") 8 | LIBEXT = $(shell $(PYTHON_PATH)-config --extension-suffix) 9 | else 10 | $(error "python3-config not found.
Please run 'sudo apt install -y python3-dev' on Ubuntu or 'sudo yum install -y python3-devel' on Amazon Linux.") 11 | endif 12 | 13 | default: $(LIBNAME)$(LIBEXT) 14 | 15 | %$(LIBEXT): %.cpp 16 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 17 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indices. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/test/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # This file isn't really a formal automated test; it's just a place to 2 | # put some code used during development and manual testing of 3 | # indexed_dataset.
4 | 5 | from megatron.data import indexed_dataset 6 | from megatron.tokenizer import build_tokenizer 7 | import argparse 8 | import os 9 | import sys 10 | 11 | import torch 12 | 13 | script_dir = os.path.dirname(os.path.realpath(__file__)) 14 | sys.path.append(os.path.join(script_dir, "../../../")) 15 | 16 | 17 | def test_indexed_dataset(args): 18 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 19 | tokenizer = build_tokenizer(args) 20 | print(len(ds.doc_idx)) 21 | print(len(ds)) 22 | print(ds.doc_idx[-1]) 23 | if ds.supports_prefetch: 24 | # just prefetch the whole thing in test (so assume it is small) 25 | ds.prefetch(range(len(ds))) 26 | if args.count > len(ds.doc_idx) - 1: 27 | args.count = len(ds.doc_idx) - 1 28 | 29 | for i in range(args.count): 30 | start = ds.doc_idx[i] 31 | end = ds.doc_idx[i + 1] 32 | ids = ds[start:end] 33 | print(f"Document {i}:") 34 | print("--------------") 35 | for s in ids: 36 | assert len(s) > 0 37 | l = s.data.tolist() 38 | text = tokenizer.detokenize(l) 39 | print(text) 40 | print("---") 41 | 42 | 43 | def test_indexed_dataset_get(args): 44 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 45 | tokenizer = build_tokenizer(args) 46 | size = ds.sizes[0] 47 | print(f"size: {size}") 48 | full = ds.get(0) 49 | print(full) 50 | # print(tokenizer.detokenize(full.data.tolist())) 51 | print("---") 52 | end = ds.get(0, offset=size - 10) 53 | print(end) 54 | # print(tokenizer.detokenize(end.data.tolist())) 55 | 56 | start = ds.get(0, length=10) 57 | print(start) 58 | # print(tokenizer.detokenize(start.data.tolist())) 59 | 60 | part = ds.get(0, offset=2, length=8) 61 | print(part) 62 | # print(tokenizer.detokenize(part.data.tolist())) 63 | 64 | # def test_albert_dataset(args): 65 | # # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) 66 | # # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) 67 | # # ds = AlbertDataset(idataset, tokenizer) 68 | # ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, 69 | # args.epochs, args.max_num_samples, 70 | # args.masked_lm_prob, args.seq_length, 71 | # args.short_seq_prob, args.seed) 72 | # truncated = 0 73 | # total = 0 74 | # for i, s in enumerate(ds): 75 | # ids = s['text'] 76 | # tokens = ds.tokenizer.convert_ids_to_tokens(ids) 77 | # print(tokens) 78 | # if i >= args.count-1: 79 | # exit() 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--data', type=str, help='prefix to data files') 85 | parser.add_argument('--dataset-impl', type=str, default='infer', 86 | choices=['lazy', 'cached', 'mmap', 'infer']) 87 | parser.add_argument('--count', type=int, default=10, 88 | help='Number of samples/documents to print') 89 | 90 | group = parser.add_argument_group(title='tokenizer') 91 | group.add_argument('--tokenizer-type', type=str, required=True, 92 | choices=['BertWordPieceLowerCase', 93 | 'GPT2BPETokenizer'], 94 | help='What type of tokenizer to use.') 95 | group.add_argument('--vocab-file', type=str, default=None, 96 | help='Path to the vocab file') 97 | group.add_argument('--merge-file', type=str, default=None, 98 | help='Path to the BPE merge file (if necessary).') 99 | 100 | parser.add_argument('--epochs', type=int, default=5, 101 | help='Number of epochs to plan for') 102 | parser.add_argument('--max-num-samples', type=int, default=None, 103 | help='Maximum number of samples to plan for') 104 | parser.add_argument('--masked-lm-prob', type=float, default=0.15, 105 | help='probability of masking 
tokens') 106 | parser.add_argument('--seq-length', type=int, default=512, 107 | help='maximum sequence length') 108 | parser.add_argument('--short-seq-prob', type=float, default=0.1, 109 | help='probability of creating a short sequence') 110 | parser.add_argument('--seed', type=int, default=1234, 111 | help='random seed') 112 | args = parser.parse_args() 113 | args.rank = 0 114 | args.make_vocab_size_divisible_by = 128 115 | args.tensor_model_parallel_size = 1 116 | 117 | if args.dataset_impl == "infer": 118 | args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) 119 | 120 | # test_albert_dataset(args) 121 | test_indexed_dataset_get(args) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
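# Usage sketch (hypothetical path; assumes an ImageNet-style layout with class
# subfolders under `train/` and `val/`):
#   train_ds, valid_ds = build_train_valid_datasets(["/data/imagenet"], crop_size=224)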
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """For backward compatibility, we need the class definitions to deserialize.""" 17 | 18 | class LossScaler: 19 | def __init__(self, scale=1): 20 | self.cur_scale = scale 21 | 22 | class DynamicLossScaler: 23 | def __init__(self, 24 | init_scale=2**32, 25 | scale_factor=2., 26 | scale_window=1000, 27 | min_scale=1, 28 | delayed_shift=1, 29 | consecutive_hysteresis=False): 30 | self.cur_scale = init_scale 31 | self.cur_iter = 0 32 | self.last_overflow_iter = -1 33 | self.scale_factor = scale_factor 34 | self.scale_window = scale_window 35 | self.min_scale = min_scale 36 | self.delayed_shift = delayed_shift 37 | self.cur_hysteresis = delayed_shift 38 | self.consecutive_hysteresis = consecutive_hysteresis 39 | 40 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import pathlib 18 | import subprocess 19 | 20 | from torch.utils import cpp_extension 21 | 22 | # Setting this param to a list has a problem of generating different 23 | # compilation commands (with a different order of architectures) and 24 | # leading to recompilation of fused kernels. Set it to an empty string 25 | # to avoid recompilation and assign arch flags explicitly in 26 | # extra_cuda_cflags below 27 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 28 | 29 | 30 | def load(args): 31 | 32 | # Check if CUDA 11 is installed for compute capability 8.0 33 | cc_flag = [] 34 | _, bare_metal_major, _ = _get_cuda_bare_metal_version( 35 | cpp_extension.CUDA_HOME) 36 | if int(bare_metal_major) >= 11: 37 | cc_flag.append('-gencode') 38 | cc_flag.append('arch=compute_80,code=sm_80') 39 | 40 | # Build path 41 | srcpath = pathlib.Path(__file__).parent.absolute() 42 | buildpath = srcpath / 'build' 43 | _create_build_dir(buildpath) 44 | 45 | # Helper function to build the kernels. 46 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 47 | return cpp_extension.load( 48 | name=name, 49 | sources=sources, 50 | build_directory=buildpath, 51 | extra_cflags=['-O3',], 52 | extra_cuda_cflags=['-O3', 53 | '-gencode', 'arch=compute_70,code=sm_70', 54 | '--use_fast_math'] + extra_cuda_flags + cc_flag, 55 | verbose=(args.rank == 0) 56 | ) 57 | 58 | # ============== 59 | # Fused softmax. 60 | # ============== 61 | 62 | if args.masked_softmax_fusion: 63 | extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', 64 | '-U__CUDA_NO_HALF_CONVERSIONS__', 65 | '--expt-relaxed-constexpr', 66 | '--expt-extended-lambda'] 67 | 68 | # Upper triangular softmax. 69 | sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', 70 | srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] 71 | scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( 72 | "scaled_upper_triang_masked_softmax_cuda", 73 | sources, extra_cuda_flags) 74 | 75 | # Masked softmax. 76 | sources=[srcpath / 'scaled_masked_softmax.cpp', 77 | srcpath / 'scaled_masked_softmax_cuda.cu'] 78 | scaled_masked_softmax_cuda = _cpp_extention_load_helper( 79 | "scaled_masked_softmax_cuda", sources, extra_cuda_flags) 80 | 81 | # ================================= 82 | # Mixed precision fused layer norm. 
83 | # ================================= 84 | 85 | extra_cuda_flags = ['-maxrregcount=50'] 86 | sources=[srcpath / 'layer_norm_cuda.cpp', 87 | srcpath / 'layer_norm_cuda_kernel.cu'] 88 | fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( 89 | "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) 90 | 91 | 92 | def _get_cuda_bare_metal_version(cuda_dir): 93 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 94 | universal_newlines=True) 95 | output = raw_output.split() 96 | release_idx = output.index("release") + 1 97 | release = output[release_idx].split(".") 98 | bare_metal_major = release[0] 99 | bare_metal_minor = release[1][0] 100 | 101 | return raw_output, bare_metal_major, bare_metal_minor 102 | 103 | 104 | def _create_build_dir(buildpath): 105 | try: 106 | os.mkdir(buildpath) 107 | except OSError: 108 | if not os.path.isdir(buildpath): 109 | print(f"Creation of the build directory {buildpath} failed") 110 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <cuda_fp16.h> 18 | #include <torch/extension.h> 19 | #include <vector> 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | int get_batch_per_block_cuda( 36 | int query_seq_len, 37 | int key_seq_len, 38 | int batches, 39 | int attn_heads); 40 | 41 | torch::Tensor fwd( 42 | torch::Tensor const& input, 43 | torch::Tensor const& mask, 44 | float scale_factor) { 45 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 46 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 47 | (input.scalar_type() == at::ScalarType::BFloat16), 48 | "Only fp16 and bf16 are supported"); 49 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 50 | 51 | return fwd_cuda(input, mask, scale_factor); 52 | } 53 | 54 | torch::Tensor bwd( 55 | torch::Tensor const& output_grads, 56 | torch::Tensor const& softmax_results, 57 | float scale_factor) { 58 | 59 | AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); 60 | AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); 61 | 62 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 63 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 64 | "Only fp16 and bf16 are supported"); 65 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 66 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 67 | "Only fp16 and bf16 are supported"); 68 | 69 | return bwd_cuda(output_grads, softmax_results, scale_factor); 70 | } 71 | 72 | int get_batch_per_block( 73 | int query_seq_len, 74 | int key_seq_len, 75 | int batches, 76 | int attn_heads) { 77 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 78 | } 79 | 80 | } // end namespace scaled_masked_softmax 81 | } // end namespace fused_softmax 82 | } // end namespace multihead_attn 83 | 84 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 85 | m.def("forward", 86 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 87 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 88 | 89 | m.def("backward", 90 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 91 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 92 | 93 | m.def("get_batch_per_block", 94 | &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, 95 | "Return Batch per block size." 96 | ); 97 | } 98 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <ATen/ATen.h> 18 | #include <cuda.h> 19 | #include <cuda_runtime.h> 20 | #include <cuda_fp16.h> 21 | #include <cuda_profiler_api.h> 22 | #include <ATen/cuda/CUDAContext.h> 23 | #include <torch/extension.h> 24 | #include "scaled_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 32 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 33 | } 34 | 35 | 36 | torch::Tensor fwd_cuda( 37 | torch::Tensor const& input, 38 | torch::Tensor const& mask, 39 | float scale_factor) 40 | { 41 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 42 | const int batches = input.size(0); 43 | const int pad_batches = mask.size(0); 44 | const int attn_heads = input.size(1); 45 | const int query_seq_len = input.size(2); 46 | const int key_seq_len = input.size(3); 47 | TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); 48 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 49 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 50 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 51 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 52 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 53 | 54 | // Output 55 | auto act_options = input.options().requires_grad(false); 56 | torch::Tensor softmax_results = 57 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 58 | 59 | // Softmax Intermediate Result Ptr 60 | void* input_ptr = static_cast<void*>(input.data_ptr()); 61 | void* mask_ptr = static_cast<void*>(mask.data_ptr()); 62 | void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr()); 63 | 64 | DISPATCH_HALF_AND_BFLOAT( 65 | input.scalar_type(), 66 | "dispatch_scaled_masked_softmax_forward", 67 | dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>( 68 | reinterpret_cast<scalar_t*>(softmax_results_ptr), 69 | reinterpret_cast<const scalar_t*>(input_ptr), 70 | reinterpret_cast<const uint8_t*>(mask_ptr), 71 | scale_factor, 72 | query_seq_len, 73 | key_seq_len, 74 | batches, 75 | attn_heads, 76 | pad_batches); 77 | ); 78 | return softmax_results; 79 | } 80 | 81 | torch::Tensor bwd_cuda( 82 | torch::Tensor const& output_grads_, 83 | torch::Tensor const& softmax_results_, 84 | float scale_factor) { 85 | 86 | auto output_grads = output_grads_.contiguous(); 87 | auto softmax_results = softmax_results_.contiguous(); 88 | 89 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 90 | const int batches = output_grads.size(0); 91 | const int attn_heads = output_grads.size(1); 92 | const int query_seq_len = output_grads.size(2); 93 | const int key_seq_len = output_grads.size(3); 94 | 95 | void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr()); 96 | 97 | //Softmax Grad 98 | DISPATCH_HALF_AND_BFLOAT( 99 | output_grads_.scalar_type(), 100 | "dispatch_scaled_masked_softmax_backward", 101 | dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>( 102 | reinterpret_cast<scalar_t*>(output_grads_ptr), 103 | reinterpret_cast<scalar_t*>(output_grads_ptr), 104 | reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), 105 | scale_factor, 106 | query_seq_len, 107 | key_seq_len, 108 | batches, 109 | attn_heads); 110 | ); 111 | 112 | //backward pass is completely in-place 113 | return output_grads; 114 | } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA 
CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <cuda_fp16.h> 18 | #include <torch/extension.h> 19 | #include <vector> 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 37 | (input.scalar_type() == at::ScalarType::BFloat16), 38 | "Only fp16 and bf16 are supported"); 39 | 40 | return fwd_cuda(input, scale_factor); 41 | } 42 | 43 | torch::Tensor bwd( 44 | torch::Tensor const& output_grads, 45 | torch::Tensor const& softmax_results, 46 | float scale_factor) { 47 | 48 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 49 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 50 | 51 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 52 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 55 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 56 | "Only fp16 and bf16 are supported"); 57 | 58 | return bwd_cuda(output_grads, softmax_results, scale_factor); 59 | } 60 | 61 | } // end namespace scaled_upper_triang_masked_softmax 62 | } // end namespace fused_softmax 63 | } // end namespace multihead_attn 64 | 65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 66 | m.def("forward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 69 | m.def("backward", 70 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 71 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 72 | } 73 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <ATen/ATen.h> 18 | #include <cuda.h> 19 | #include <cuda_runtime.h> 20 | #include <cuda_fp16.h> 21 | #include <cuda_profiler_api.h> 22 | #include <ATen/cuda/CUDAContext.h> 23 | #include <torch/extension.h> 24 | #include "scaled_upper_triang_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast<void*>(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr()); 48 | 49 | DISPATCH_HALF_AND_BFLOAT( 50 | input.scalar_type(), 51 | "dispatch_scaled_upper_triang_masked_softmax_forward", 52 | dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>( 53 | reinterpret_cast<scalar_t*>(softmax_results_ptr), 54 | reinterpret_cast<const scalar_t*>(input_ptr), 55 | scale_factor, 56 | seq_len, 57 | seq_len, 58 | attn_batches); 59 | ); 60 | return softmax_results; 61 | } 62 | 63 | 64 | torch::Tensor bwd_cuda( 65 | torch::Tensor const& output_grads_, 66 | torch::Tensor const& softmax_results_, 67 | float scale_factor) { 68 | 69 | auto output_grads = output_grads_.contiguous(); 70 | auto softmax_results = softmax_results_.contiguous(); 71 | 72 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 73 | const int attn_batches = output_grads.size(0); 74 | const int seq_len = output_grads.size(1); 75 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 76 | 77 | void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr()); 78 | 79 | //Softmax Grad 80 | DISPATCH_HALF_AND_BFLOAT( 81 | output_grads_.scalar_type(), 82 | "dispatch_scaled_upper_triang_masked_softmax_backward", 83 | dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>( 84 | reinterpret_cast<scalar_t*>(output_grads_ptr), 85 | reinterpret_cast<scalar_t*>(output_grads_ptr), 86 | reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), 87 | scale_factor, 88 | seq_len, 89 | seq_len, 90 | attn_batches); 91 | ); 92 | 93 | //backward pass is completely in-place 94 | return output_grads; 95 | } 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | #include <ATen/ATen.h> 19 | #include "compat.h" 20 | 21 | 22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 23 | switch(TYPE) \ 24 | { \ 25 | case at::ScalarType::Half: \ 26 | { \ 27 | using scalar_t = at::Half; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::BFloat16: \ 32 | { \ 33 | using scalar_t = at::BFloat16; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | default: \ 38 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 39 | } 40 | 41 | 42 | 43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 44 | switch(TYPEIN) \ 45 | { \ 46 | case at::ScalarType::Float: \ 47 | { \ 48 | using scalar_t_in = float; \ 49 | switch(TYPEOUT) \ 50 | { \ 51 | case at::ScalarType::Float: \ 52 | { \ 53 | using scalar_t_out = float; \ 54 | __VA_ARGS__; \ 55 | break; \ 56 | } \ 57 | case at::ScalarType::Half: \ 58 | { \ 59 | using scalar_t_out = at::Half; \ 60 | __VA_ARGS__; \ 61 | break; \ 62 | } \ 63 | case at::ScalarType::BFloat16: \ 64 | { \ 65 | using scalar_t_out = at::BFloat16; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | default: \ 70 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 71 | } \ 72 | break; \ 73 | } \ 74 | case at::ScalarType::Half: \ 75 | { \ 76 | using scalar_t_in = at::Half; \ 77 | using scalar_t_out = at::Half; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | case at::ScalarType::BFloat16: \ 82 | { \ 83 | using scalar_t_in = at::BFloat16; \ 84 | using scalar_t_out = at::BFloat16; \ 85 | __VA_ARGS__; \ 86 | break; \ 87 | } \ 88 | default: \ 89 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 90 | } 91 | 92 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved. 
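# Note on the import right below: the CUDA MixedFusedLayerNorm from
# fused_layer_norm.py is commented out in favor of the stock torch.nn.LayerNorm,
# presumably because the custom fused CUDA kernel cannot run on Neuron/XLA
# devices, while the framework-native op can.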
16 | 17 | #from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 18 | from torch.nn import LayerNorm 19 | 20 | from .distributed import DistributedDataParallel 21 | from .bert_model import BertModel 22 | from .gpt_model import GPTModel 23 | from .t5_model import T5Model 24 | from .language_model import get_language_model 25 | from .module import Float16Module 26 | from .enums import ModelType 27 | -------------------------------------------------------------------------------- /megatron/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Classification model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_last 21 | from megatron import mpu 22 | from megatron.model.enums import AttnMaskType 23 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids 24 | from megatron.model.language_model import get_language_model 25 | from megatron.model.utils import get_linear_layer 26 | from megatron.model.utils import init_method_normal 27 | from megatron.model.utils import scaled_init_method_normal 28 | from .module import MegatronModule 29 | 30 | 31 | class Classification(MegatronModule): 32 | 33 | def __init__(self, 34 | num_classes, 35 | num_tokentypes=2, 36 | pre_process=True, 37 | post_process=True): 38 | super(Classification, self).__init__(share_word_embeddings=False) 39 | args = get_args() 40 | 41 | self.num_classes = num_classes 42 | self.pre_process = pre_process 43 | self.post_process = post_process 44 | init_method = init_method_normal(args.init_method_std) 45 | 46 | self.language_model, self._language_model_key = get_language_model( 47 | num_tokentypes=num_tokentypes, 48 | add_pooler=True, 49 | encoder_attn_mask_type=AttnMaskType.padding, 50 | init_method=init_method, 51 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 52 | args.num_layers), 53 | pre_process=self.pre_process, 54 | post_process=self.post_process) 55 | 56 | # Classification head. 
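        # (Built only on the post_process, i.e. last pipeline, stage. The head is
        # dropout followed by a dense projection of BERT's pooled output --
        # roughly logits = Linear(hidden_size, num_classes)(Dropout(p)(pooled)).)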
57 | if self.post_process: 58 | self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) 59 | self.classification_head = get_linear_layer(args.hidden_size, 60 | self.num_classes, 61 | init_method) 62 | self._classification_head_key = 'classification_head' 63 | 64 | def set_input_tensor(self, input_tensor): 65 | """See megatron.model.transformer.set_input_tensor()""" 66 | self.language_model.set_input_tensor(input_tensor) 67 | 68 | def forward(self, model_input, attention_mask, tokentype_ids=None): 69 | 70 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 71 | input_ids = model_input 72 | position_ids = bert_position_ids(input_ids) 73 | 74 | lm_output = self.language_model( 75 | input_ids, 76 | position_ids, 77 | extended_attention_mask, 78 | tokentype_ids=tokentype_ids 79 | ) 80 | 81 | if self.post_process: 82 | _, pooled_output = lm_output 83 | classification_output = self.classification_dropout(pooled_output) 84 | classification_logits = self.classification_head(classification_output) 85 | 86 | # Reshape back to separate choices. 87 | classification_logits = classification_logits.view(-1, self.num_classes) 88 | 89 | return classification_logits 90 | return lm_output 91 | 92 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 93 | keep_vars=False): 94 | """For easy load when model is combined with other heads, 95 | add an extra key.""" 96 | 97 | state_dict_ = {} 98 | state_dict_[self._language_model_key] \ 99 | = self.language_model.state_dict_for_save_checkpoint( 100 | destination, prefix, keep_vars) 101 | if self.post_process: 102 | state_dict_[self._classification_head_key] \ 103 | = self.classification_head.state_dict( 104 | destination, prefix, keep_vars) 105 | return state_dict_ 106 | 107 | def load_state_dict(self, state_dict, strict=True): 108 | """Customized load.""" 109 | 110 | self.language_model.load_state_dict( 111 | state_dict[self._language_model_key], strict=strict) 112 | if self.post_process: 113 | if self._classification_head_key in state_dict: 114 | self.classification_head.load_state_dict( 115 | state_dict[self._classification_head_key], strict=strict) 116 | else: 117 | print_rank_last('***WARNING*** could not find {} in the checkpoint, ' 118 | 'initializing to random'.format( 119 | self._classification_head_key)) 120 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
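# These enums thread through the rest of megatron/model: ModelType separates
# single-stack networks such as BERT and GPT (encoder_or_decoder) from
# encoder-decoder ones such as T5, while AttnMaskType selects the mask flavor --
# GPTModel passes AttnMaskType.causal for autoregressive attention and the
# BERT-based models pass AttnMaskType.padding for bidirectional attention.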
15 | 16 | import enum 17 | 18 | class ModelType(enum.Enum): 19 | encoder_or_decoder = 1 20 | encoder_and_decoder = 2 21 | 22 | class LayerType(enum.Enum): 23 | encoder = 1 24 | decoder = 2 25 | 26 | class AttnType(enum.Enum): 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | class AttnMaskType(enum.Enum): 31 | padding = 1 32 | causal = 2 33 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | 19 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 20 | # 1/sqrt(2*pi)-> 0.3989423 21 | # 1/sqrt(2) -> 0.70710678 22 | # sqrt(2/pi) -> 0.79788456 23 | # this function is tanh approximation of gelu 24 | # actual gelu is: 25 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 26 | 27 | @torch.jit.script 28 | def bias_gelu(bias, y): 29 | x = bias + y 30 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 31 | 32 | # gradient of tanh approximation of gelu 33 | # gradient of actual gelu is: 34 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 35 | @torch.jit.script 36 | def bias_gelu_back(g, bias, y): 37 | x = bias + y 38 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 39 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 40 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 41 | return ff*g 42 | 43 | class GeLUFunction(torch.autograd.Function): 44 | @staticmethod 45 | # bias is an optional argument 46 | def forward(ctx, input, bias): 47 | ctx.save_for_backward(input, bias) 48 | return bias_gelu(bias, input) 49 | 50 | @staticmethod 51 | def backward(ctx, grad_output): 52 | input, bias = ctx.saved_tensors 53 | tmp = bias_gelu_back(grad_output, bias, input) 54 | return tmp, tmp 55 | 56 | bias_gelu_impl = GeLUFunction.apply 57 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """This code is copied fron NVIDIA apex: 17 | https://github.com/NVIDIA/apex 18 | with some changes. """ 19 | 20 | import numbers 21 | import torch 22 | from torch.nn.parameter import Parameter 23 | from torch.nn import init 24 | import importlib 25 | 26 | global fused_mix_prec_layer_norm_cuda 27 | fused_mix_prec_layer_norm_cuda = None 28 | 29 | 30 | class FusedLayerNormAffineFunction(torch.autograd.Function): 31 | 32 | @staticmethod 33 | def forward(ctx, input, weight, bias, normalized_shape, eps): 34 | 35 | ctx.normalized_shape = normalized_shape 36 | ctx.eps = eps 37 | input_ = input.contiguous() 38 | weight_ = weight.contiguous() 39 | bias_ = bias.contiguous() 40 | output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine( 41 | input_, ctx.normalized_shape, weight_, bias_, ctx.eps) 42 | ctx.save_for_backward(input_, weight_, bias_, mean, invvar) 43 | 44 | return output 45 | 46 | 47 | @staticmethod 48 | def backward(ctx, grad_output): 49 | 50 | input_, weight_, bias_, mean, invvar = ctx.saved_tensors 51 | grad_input = grad_weight = grad_bias = None 52 | grad_input, grad_weight, grad_bias \ 53 | = fused_mix_prec_layer_norm_cuda.backward_affine( 54 | grad_output.contiguous(), mean, invvar, 55 | input_, ctx.normalized_shape, 56 | weight_, bias_, ctx.eps) 57 | 58 | return grad_input, grad_weight, grad_bias, None, None 59 | 60 | 61 | 62 | class MixedFusedLayerNorm(torch.nn.Module): 63 | 64 | def __init__(self, normalized_shape, eps=1e-5): 65 | super(MixedFusedLayerNorm, self).__init__() 66 | 67 | global fused_mix_prec_layer_norm_cuda 68 | #fused_mix_prec_layer_norm_cuda = importlib.import_module( 69 | # "fused_mix_prec_layer_norm_cuda") 70 | fused_mix_prec_layer_norm_cuda = importlib.import_module( 71 | "fused_layer_norm_cuda") 72 | 73 | if isinstance(normalized_shape, numbers.Integral): 74 | normalized_shape = (normalized_shape,) 75 | self.normalized_shape = torch.Size(normalized_shape) 76 | self.eps = eps 77 | self.weight = Parameter(torch.Tensor(*normalized_shape)) 78 | self.bias = Parameter(torch.Tensor(*normalized_shape)) 79 | self.reset_parameters() 80 | 81 | 82 | def reset_parameters(self): 83 | 84 | init.ones_(self.weight) 85 | init.zeros_(self.bias) 86 | 87 | 88 | def forward(self, input): 89 | 90 | return FusedLayerNormAffineFunction.apply( 91 | input, self.weight, self.bias, self.normalized_shape,self.eps) 92 | 93 | -------------------------------------------------------------------------------- /megatron/model/gpt_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """GPT-2 model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import mpu 22 | from .module import MegatronModule 23 | 24 | from .enums import AttnMaskType 25 | from .language_model import parallel_lm_logits 26 | from .language_model import get_language_model 27 | from .utils import init_method_normal 28 | from .utils import scaled_init_method_normal 29 | 30 | 31 | def post_language_model_processing(lm_output, labels, logit_weights, 32 | parallel_output, 33 | fp16_lm_cross_entropy): 34 | 35 | # Output. 36 | output = parallel_lm_logits( 37 | lm_output, 38 | logit_weights, 39 | parallel_output) 40 | 41 | if labels is None: 42 | return output 43 | else: 44 | if fp16_lm_cross_entropy: 45 | assert output.dtype == torch.half 46 | loss = mpu.vocab_parallel_cross_entropy(output, labels) 47 | else: 48 | loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) 49 | return loss 50 | 51 | 52 | class GPTModel(MegatronModule): 53 | """GPT-2 Language model.""" 54 | 55 | def __init__(self, 56 | num_tokentypes=0, 57 | parallel_output=True, 58 | pre_process=True, 59 | post_process=True): 60 | super(GPTModel, self).__init__() 61 | args = get_args() 62 | 63 | self.parallel_output = parallel_output 64 | self.pre_process = pre_process 65 | self.post_process = post_process 66 | self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy 67 | 68 | self.language_model, self._language_model_key = get_language_model( 69 | num_tokentypes=num_tokentypes, 70 | add_pooler=False, 71 | encoder_attn_mask_type=AttnMaskType.causal, 72 | init_method=init_method_normal(args.init_method_std), 73 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 74 | args.num_layers), 75 | pre_process=self.pre_process, 76 | post_process=self.post_process) 77 | 78 | self.initialize_word_embeddings(init_method_normal) 79 | 80 | def set_input_tensor(self, input_tensor): 81 | """See megatron.model.transformer.set_input_tensor()""" 82 | self.language_model.set_input_tensor(input_tensor) 83 | 84 | def forward(self, input_ids, position_ids, attention_mask, labels=None, 85 | tokentype_ids=None, 86 | set_inference_key_value_memory=False, 87 | inference_max_sequence_len=None): 88 | 89 | lm_output = self.language_model( 90 | input_ids, 91 | position_ids, 92 | attention_mask, 93 | set_inference_key_value_memory=set_inference_key_value_memory, 94 | inference_max_sequence_len=inference_max_sequence_len) 95 | 96 | if self.post_process: 97 | return post_language_model_processing( 98 | lm_output, labels, 99 | self.word_embeddings_weight(), 100 | self.parallel_output, 101 | self.fp16_lm_cross_entropy) 102 | else: 103 | return lm_output 104 | 105 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 106 | keep_vars=False): 107 | 108 | state_dict_ = {} 109 | state_dict_[self._language_model_key] \ 110 | = self.language_model.state_dict_for_save_checkpoint( 111 | destination, prefix, keep_vars) 112 | # Save word_embeddings. 113 | if self.post_process and not self.pre_process: 114 | state_dict_[self._word_embeddings_for_head_key] \ 115 | = self.word_embeddings.state_dict(destination, prefix, keep_vars) 116 | return state_dict_ 117 | 118 | def load_state_dict(self, state_dict, strict=True): 119 | """Customized load.""" 120 | 121 | # Load word_embeddings. 
122 | if self.post_process and not self.pre_process: 123 | self.word_embeddings.load_state_dict( 124 | state_dict[self._word_embeddings_for_head_key], strict=strict) 125 | if self._language_model_key in state_dict: 126 | state_dict = state_dict[self._language_model_key] 127 | self.language_model.load_state_dict(state_dict, strict=strict) 128 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from megatron import get_args 23 | 24 | def init_method_normal(sigma): 25 | """Init method based on N(0, sigma).""" 26 | def init_(tensor): 27 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 28 | 29 | return init_ 30 | 31 | 32 | def scaled_init_method_normal(sigma, num_layers): 33 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 34 | std = sigma / math.sqrt(2.0 * num_layers) 35 | 36 | def init_(tensor): 37 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 38 | 39 | return init_ 40 | 41 | 42 | def attention_mask_func(attention_scores, attention_mask): 43 | attention_scores.masked_fill_(attention_mask, -10000.0) 44 | return attention_scores 45 | 46 | 47 | def get_linear_layer(rows, columns, init_method): 48 | """Simple linear layer with weight initialization.""" 49 | layer = torch.nn.Linear(rows, columns) 50 | init_method(layer.weight) 51 | with torch.no_grad(): 52 | layer.bias.zero_() 53 | return layer 54 | 55 | @torch.jit.script 56 | def gelu_impl(x): 57 | """OpenAI's gelu implementation.""" 58 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 59 | (1.0 + 0.044715 * x * x))) 60 | def openai_gelu(x): 61 | return gelu_impl(x) 62 | 63 | # This is actually the Python equivalent of torch.nn.functional.gelu(), with type hints for the ONNX exporter 64 | @torch.jit.script 65 | def erf_gelu(x): 66 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 67 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import is_rank_in_embedding_group 35 | from .initialize import is_pipeline_stage_before_split, is_pipeline_stage_after_split 36 | from .initialize import is_pipeline_stage_at_split 37 | from .initialize import get_num_layers 38 | from .initialize import get_tensor_model_parallel_src_rank 39 | from .initialize import get_pipeline_model_parallel_first_rank 40 | from .initialize import get_pipeline_model_parallel_last_rank 41 | from .initialize import get_pipeline_model_parallel_next_rank 42 | from .initialize import get_pipeline_model_parallel_prev_rank 43 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 44 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 45 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 46 | from .initialize import get_next_rank_group, get_prev_rank_group 47 | from .initialize import initialize_model_parallel 48 | from .initialize import model_parallel_is_initialized 49 | 50 | from .layers import ColumnParallelLinear 51 | from .layers import RowParallelLinear 52 | from .layers import VocabParallelEmbedding 53 | from .layers import (set_tensor_model_parallel_attributes, 54 | set_defaults_if_not_set_tensor_model_parallel_attributes, 55 | copy_tensor_model_parallel_attributes) 56 | 57 | from .mappings import copy_to_tensor_model_parallel_region 58 | from .mappings import gather_from_tensor_model_parallel_region 59 | from .mappings import reduce_from_tensor_model_parallel_region 60 | from .mappings import scatter_to_tensor_model_parallel_region 61 | 62 | from .random import checkpoint 63 | from .random import get_cuda_rng_tracker 64 | from .random import model_parallel_cuda_manual_seed 65 | from .random import gather_split_1d_tensor 66 | from .random import split_tensor_into_1d_equal_chunks 67 | 68 | from .utils import divide 69 | from .utils import split_tensor_along_last_dim 70 | -------------------------------------------------------------------------------- /megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_tensor_model_parallel_group 19 | from .initialize import get_tensor_model_parallel_rank 20 | from .initialize import get_tensor_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_tensor_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), 51 | group=get_tensor_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data dictionary to be broadcast 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_tensor_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 
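        # (Packing everything into one flat tensor means a single broadcast per
        # step instead of one collective per key; e.g. {'text': LongTensor(4, 512)}
        # travels as one 2048-element tensor and is reshaped on arrival using the
        # size dictionary built above.)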
94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Broadcast 104 | torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(), 105 | group=get_tensor_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved. 16 | 17 | import torch 18 | 19 | from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank 20 | from .utils import split_tensor_along_last_dim 21 | import torch_xla.core.xla_model as xm 22 | 23 | 24 | def _reduce(input_): 25 | """All-reduce the input tensor across model parallel group.""" 26 | 27 | # Bypass the function if we are using only 1 GPU. 28 | if get_tensor_model_parallel_world_size()==1: 29 | return input_ 30 | 31 | # All-reduce. 32 | torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group(), async_op=True) 33 | 34 | return input_ 35 | 36 | 37 | def _split(input_): 38 | """Split the tensor along its last dimension and keep the 39 | corresponding slice.""" 40 | 41 | world_size = get_tensor_model_parallel_world_size() 42 | # Bypass the function if we are using only 1 GPU. 43 | if world_size==1: 44 | return input_ 45 | 46 | # Split along last dimension. 47 | input_list = split_tensor_along_last_dim(input_, world_size) 48 | 49 | # Note: torch.split does not create contiguous tensors by default. 50 | rank = get_tensor_model_parallel_rank() 51 | output = input_list[rank].contiguous() 52 | 53 | return output 54 | 55 | 56 | def _gather(input_): 57 | """Gather tensors and concatenate along the last dimension.""" 58 | 59 | world_size = get_tensor_model_parallel_world_size() 60 | # Bypass the function if we are using only 1 GPU. 61 | if world_size==1: 62 | return input_ 63 | 64 | # Size and dimension. 
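    # (Shape sketch: with a tensor-parallel world size of 2 and a per-rank input
    # of [b, s, h/2], all_gather below collects both shards and the concatenation
    # restores the full [b, s, h] activation.)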
65 | last_dim = input_.dim() - 1 66 | rank = get_tensor_model_parallel_rank() 67 | 68 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 69 | tensor_list[rank] = input_ 70 | torch.distributed.all_gather(tensor_list, input_, 71 | group=get_tensor_model_parallel_group(), 72 | async_op=True) 73 | 74 | # Note: torch.cat already creates a contiguous tensor. 75 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 76 | 77 | return output 78 | 79 | 80 | class _CopyToModelParallelRegion(torch.autograd.Function): 81 | """Pass the input to the model parallel region.""" 82 | 83 | @staticmethod 84 | def symbolic(graph, input_): 85 | return input_ 86 | 87 | @staticmethod 88 | def forward(ctx, input_): 89 | return input_ 90 | 91 | @staticmethod 92 | def backward(ctx, grad_output): 93 | return _reduce(grad_output) 94 | 95 | 96 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 97 | """All-reduce the input from the model parallel region.""" 98 | 99 | @staticmethod 100 | def symbolic(graph, input_): 101 | return _reduce(input_) 102 | 103 | @staticmethod 104 | def forward(ctx, input_): 105 | return _reduce(input_) 106 | 107 | @staticmethod 108 | def backward(ctx, grad_output): 109 | return grad_output 110 | 111 | 112 | class _ScatterToModelParallelRegion(torch.autograd.Function): 113 | """Split the input and keep only the chunk corresponding to the rank.""" 114 | 115 | @staticmethod 116 | def symbolic(graph, input_): 117 | return _split(input_) 118 | 119 | @staticmethod 120 | def forward(ctx, input_): 121 | return _split(input_) 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | return _gather(grad_output) 126 | 127 | 128 | class _GatherFromModelParallelRegion(torch.autograd.Function): 129 | """Gather the input from model parallel region and concatenate.""" 130 | 131 | @staticmethod 132 | def symbolic(graph, input_): 133 | return _gather(input_) 134 | 135 | @staticmethod 136 | def forward(ctx, input_): 137 | return _gather(input_) 138 | 139 | @staticmethod 140 | def backward(ctx, grad_output): 141 | return _split(grad_output) 142 | 143 | 144 | # ----------------- 145 | # Helper functions. 146 | # ----------------- 147 | 148 | def copy_to_tensor_model_parallel_region(input_): 149 | return _CopyToModelParallelRegion.apply(input_) 150 | 151 | 152 | def reduce_from_tensor_model_parallel_region(input_): 153 | return _ReduceFromModelParallelRegion.apply(input_) 154 | 155 | 156 | def scatter_to_tensor_model_parallel_region(input_): 157 | return _ScatterToModelParallelRegion.apply(input_) 158 | 159 | 160 | def gather_from_tensor_model_parallel_region(input_): 161 | return _GatherFromModelParallelRegion.apply(input_) 162 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | from megatron import mpu 23 | import torch_xla.distributed.xla_backend  # for XLA backend 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducibility.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % torch.cuda.device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | torch.cuda.set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved.
16 | 
17 | import sys
18 | sys.path.append("../..")  # must run before the mpu imports below
19 | from commons import set_random_seed
20 | from commons import IdentityLayer
21 | from commons import print_separator
22 | from commons import initialize_distributed
23 | from mpu.cross_entropy import vocab_parallel_cross_entropy
24 | import mpu
25 | import torch.nn.functional as F
26 | import torch
27 | import random
28 | 
29 | 
30 | def torch_cross_entropy(batch_size, seq_length, vocab_size,
31 |                         logits_scale, seed):
32 |     set_random_seed(seed)
33 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
34 |                              scale=logits_scale).cuda()
35 |     logits = identity()
36 |     target = torch.cuda.LongTensor(
37 |         size=(batch_size, seq_length)).random_(0, vocab_size)
38 |     loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
39 |                            target.view(-1),
40 |                            reduction='none').view_as(target).mean()
41 |     loss.backward()
42 |     return loss, identity.weight.grad
43 | 
44 | 
45 | def mpu_cross_entropy(batch_size, seq_length, vocab_size,
46 |                       logits_scale, seed):
47 |     set_random_seed(seed)
48 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
49 |                              scale=logits_scale).cuda()
50 |     logits = identity()
51 |     logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
52 |     target = torch.cuda.LongTensor(
53 |         size=(batch_size, seq_length)).random_(0, vocab_size)
54 |     loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
55 |     loss.backward()
56 |     return loss, identity.weight.grad
57 | 
58 | 
59 | def test_cross_entropy(tensor_model_parallel_size):
60 | 
61 |     if torch.distributed.get_rank() == 0:
62 |         print('> testing cross entropy with model parallel size {} ...'.
63 |               format(tensor_model_parallel_size))
64 | 
65 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
66 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
67 | 
68 |     batch_size = 13
69 |     seq_length = 17
70 |     vocab_size_per_partition = 11
71 |     logits_scale = 1000.0
72 |     vocab_size = vocab_size_per_partition * tensor_model_parallel_size
73 |     seed = 1234
74 | 
75 |     loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
76 |                                                  vocab_size, logits_scale,
77 |                                                  seed)
78 |     loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
79 |                                            vocab_size, logits_scale,
80 |                                            seed)
81 | 
82 |     error = loss_torch.sub_(loss_mpu).abs().max()
83 |     print('   max error in loss on global rank {}: {}'.format(
84 |         torch.distributed.get_rank(), error))
85 |     assert error < 1.0e-6
86 | 
87 |     error = grad_torch.sub_(grad_mpu).abs().max()
88 |     print('   max error in grad on global rank {}: {}'.format(
89 |         torch.distributed.get_rank(), error))
90 |     assert error < 1.0e-6
91 | 
92 |     # Reset groups
93 |     mpu.destroy_tensor_model_parallel()
94 | 
95 |     torch.distributed.barrier()
96 |     if torch.distributed.get_rank() == 0:
97 |         print('>> passed the test :-)')
98 | 
99 | 
100 | if __name__ == '__main__':
101 | 
102 |     initialize_distributed()
103 |     world_size = torch.distributed.get_world_size()
104 | 
105 |     tensor_model_parallel_size = 1
106 |     while tensor_model_parallel_size <= world_size:
107 |         print_separator('test cross entropy')
108 |         test_cross_entropy(tensor_model_parallel_size)
109 |         tensor_model_parallel_size *= 2
-------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import sys
17 | sys.path.append("../..")  # must run before the mpu imports below
18 | from commons import print_separator
19 | from commons import initialize_distributed
20 | from mpu import data as data_utils
21 | import mpu
22 | import torch
23 | import functools
24 | import operator
25 | 
26 | 
27 | def test_broadcast_data(tensor_model_parallel_size):
28 | 
29 |     if torch.distributed.get_rank() == 0:
30 |         print('> testing broadcast_data with model parallel size {} ...'.
31 |               format(tensor_model_parallel_size))
32 | 
33 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
34 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
35 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
36 | 
37 |     key_size_t = {'key1': [7, 11],
38 |                   'key2': [8, 2, 1],
39 |                   'key3': [13],
40 |                   'key4': [5, 1, 2],
41 |                   'key5': [5, 12]}
42 |     keys = list(key_size_t.keys())
43 | 
44 |     data = {}
45 |     data_t = {}
46 |     for key in key_size_t:
47 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
48 |         data_t[key] = data[key].clone()
49 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
50 |     data_t['keyX'] = data['keyX'].clone()
51 |     if mpu.get_tensor_model_parallel_rank() != 0:
52 |         data = None
53 | 
54 |     data_utils._check_data_types(keys, data_t, torch.int64)
55 |     key_size, key_numel, \
56 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
57 |     for key in keys:
58 |         assert key_size[key] == key_size_t[key]
59 |     total_numel_t = 0
60 |     for key in keys:
61 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
62 |         assert key_numel[key] == target_size
63 |         total_numel_t += target_size
64 |     assert total_numel == total_numel_t
65 | 
66 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
67 |     for key in keys:
68 |         tensor = data_t[key].cuda()
69 |         assert data_b[key].sub(tensor).abs().max() == 0
70 | 
71 |     # Reset groups
72 |     mpu.destroy_tensor_model_parallel()
73 | 
74 |     torch.distributed.barrier()
75 |     if torch.distributed.get_rank() == 0:
76 |         print('>> passed the test :-)')
77 | 
78 | 
79 | if __name__ == '__main__':
80 | 
81 |     initialize_distributed()
82 |     world_size = torch.distributed.get_world_size()
83 | 
84 |     tensor_model_parallel_size = 1
85 |     while tensor_model_parallel_size <= world_size:
86 |         print_separator('test broadcast data')
87 |         test_broadcast_data(tensor_model_parallel_size)
88 |         tensor_model_parallel_size *= 2
-------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import sys
17 | sys.path.append("../..")  # must run before the mpu import below
18 | from commons import print_separator
19 | from commons import initialize_distributed
20 | import mpu
21 | import torch
22 | 
23 | 
24 | def test_initialize_model_parallel(tensor_model_parallel_size):
25 | 
26 |     if torch.distributed.get_rank() == 0:
27 |         print('> testing initialize_model_parallel with size {} ...'.format(
28 |             tensor_model_parallel_size))
29 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
30 |                                       torch.distributed.get_world_size())
31 |     assert not mpu.model_parallel_is_initialized()
32 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
33 |     assert mpu.model_parallel_is_initialized()
34 | 
35 |     # Checks.
36 |     def check(group, world_size, rank):
37 |         assert world_size == torch.distributed.get_world_size(group=group)
38 |         assert rank == torch.distributed.get_rank(group=group)
39 | 
40 |     # Model parallel.
41 |     world_size = tensor_model_parallel_size_
42 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
43 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
44 |     assert rank == mpu.get_tensor_model_parallel_rank()
45 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
46 | 
47 |     # Data parallel.
48 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
49 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size_
50 |     assert world_size == mpu.get_data_parallel_world_size()
51 |     assert rank == mpu.get_data_parallel_rank()
52 |     check(mpu.get_data_parallel_group(), world_size, rank)
53 | 
54 |     # Reset groups
55 |     mpu.destroy_model_parallel()
56 | 
57 |     torch.distributed.barrier()
58 |     if torch.distributed.get_rank() == 0:
59 |         print('>> passed the test :-)')
60 | 
61 | 
62 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
63 | 
64 |     if torch.distributed.get_rank() == 0:
65 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
66 |             tensor_model_parallel_size_))
67 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
68 |                                      torch.distributed.get_world_size())
69 |     assert not mpu.model_parallel_is_initialized()
70 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
71 |     assert mpu.model_parallel_is_initialized()
72 | 
73 |     # Checks
74 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
75 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
76 | 
77 |     # Reset groups
78 |     mpu.destroy_model_parallel()
79 | 
80 |     torch.distributed.barrier()
81 |     if torch.distributed.get_rank() == 0:
82 |         print('>> passed the test :-)')
83 | 
84 | 
85 | if __name__ == '__main__':
86 | 
87 |     initialize_distributed()
88 |     world_size = torch.distributed.get_world_size()
89 |     tensor_model_parallel_size = 1
90 |     while tensor_model_parallel_size <= world_size:
91 |         print_separator('test initialize model parallel')
92 |         test_initialize_model_parallel(tensor_model_parallel_size)
93 |         print_separator('test model parallel source rank')
94 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
95 |         tensor_model_parallel_size *= 2
96 | 
-------------------------------------------------------------------------------- /megatron/mpu/utils.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are half-open: [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
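A quick worked example of the partition arithmetic above (a sketch; it assumes the repository root is on PYTHONPATH):

# Splitting a 50,304-token vocabulary across 4 tensor-parallel ranks
# yields half-open [first, last) ranges of 12,576 ids each.
from megatron.mpu.utils import VocabUtility

for rank in range(4):
    first, last = VocabUtility.vocab_range_from_global_vocab_size(50304, rank, 4)
    print(rank, first, last)
# rank 0 -> [0, 12576), rank 1 -> [12576, 25152),
# rank 2 -> [25152, 37728), rank 3 -> [37728, 50304)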
-------------------------------------------------------------------------------- /megatron/optimizer/__init__.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | #from apex.optimizers import FusedAdam as Adam
17 | #from apex.optimizers import FusedSGD as SGD
18 | from torch.optim import AdamW as Adam
19 | from torch.optim import SGD
20 | 
21 | from megatron import get_args
22 | from megatron.model import LayerNorm
23 | 
24 | from .grad_scaler import ConstantGradScaler, DynamicGradScaler
25 | from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
26 | 
27 | 
28 | def _get_params_for_weight_decay_optimization(modules):
29 |     """Divide params into with-weight-decay and without-weight-decay groups.
30 |     LayerNorms and biases will have no weight decay but the rest will.
31 |     """
32 | 
33 |     weight_decay_params = {'params': []}
34 |     no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
35 |     for module in modules:
36 |         for module_ in module.modules():
37 |             if isinstance(module_, LayerNorm):
38 |                 no_weight_decay_params['params'].extend(
39 |                     [p for p in list(module_._parameters.values())
40 |                      if p is not None])
41 |             else:
42 |                 weight_decay_params['params'].extend(
43 |                     [p for n, p in list(module_._parameters.items())
44 |                      if p is not None and n != 'bias'])
45 |                 no_weight_decay_params['params'].extend(
46 |                     [p for n, p in list(module_._parameters.items())
47 |                      if p is not None and n == 'bias'])
48 | 
49 |     return weight_decay_params, no_weight_decay_params
50 | 
51 | 
52 | def get_megatron_optimizer(model):
53 |     args = get_args()
54 | 
55 |     # Base optimizer.
56 |     param_groups = _get_params_for_weight_decay_optimization(model)
57 |     if args.optimizer == 'adam':
58 |         optimizer = Adam(param_groups,
59 |                          lr=args.lr,
60 |                          weight_decay=args.weight_decay,
61 |                          betas=(args.adam_beta1, args.adam_beta2),
62 |                          eps=args.adam_eps)
63 |     elif args.optimizer == 'sgd':
64 |         optimizer = SGD(param_groups,
65 |                         lr=args.lr,
66 |                         weight_decay=args.weight_decay,
67 |                         momentum=args.sgd_momentum)
68 |     else:
69 |         raise Exception('{} optimizer is not supported.'.format(
70 |             args.optimizer))
71 | 
72 |     # Determine whether the params have main-grad field.
73 |     params_have_main_grad = False
74 |     if args.DDP_impl == 'local':
75 |         params_have_main_grad = True
76 | 
77 |     if args.fp16 or args.bf16:
78 | 
79 |         # Grad scaler:
80 |         #    if a loss-scale is provided, instantiate the constant scaler.
81 |         #    if we are using fp16 and a loss-scale is not present, use a
82 |         #       dynamic scaler.
83 |         #    otherwise we are running in bf16 with no loss-scale, so
84 |         #       leave the grad scaler as None.
85 |         grad_scaler = None
86 |         # Constant loss scale.
87 |         if args.loss_scale:
88 |             grad_scaler = ConstantGradScaler(args.loss_scale)
89 |         # Dynamic loss scale.
90 |         else:
91 |             if args.fp16:
92 |                 grad_scaler = DynamicGradScaler(
93 |                     initial_scale=args.initial_loss_scale,
94 |                     min_scale=args.min_loss_scale,
95 |                     growth_factor=2.0,
96 |                     backoff_factor=0.5,
97 |                     growth_interval=args.loss_scale_window,
98 |                     hysteresis=args.hysteresis)
99 | 
100 |         # Megatron optimizer.
101 |         return Float16OptimizerWithFloat16Params(optimizer,
102 |                                                  args.clip_grad,
103 |                                                  args.log_num_zeros_in_grad,
104 |                                                  params_have_main_grad,
105 |                                                  args.use_contiguous_buffers_in_local_ddp,
106 |                                                  args.bf16,
107 |                                                  grad_scaler)
108 | 
109 |     # FP32.
110 |     return FP32Optimizer(optimizer, args.clip_grad,
111 |                          args.log_num_zeros_in_grad,
112 |                          params_have_main_grad,
113 |                          args.use_contiguous_buffers_in_local_ddp)
114 | 
-------------------------------------------------------------------------------- /megatron/optimizer/grad_scaler.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Megatron grad scaler."""
17 | 
18 | from abc import ABC
19 | from abc import abstractmethod
20 | 
21 | import torch
22 | 
23 | import torch_xla.core.xla_model as xm
24 | torch.cuda.FloatTensor = lambda t: torch.FloatTensor(t).to(xm.xla_device())  # Neuron shim: build on the XLA device
25 | torch.cuda.IntTensor = lambda t: torch.IntTensor(t).to(xm.xla_device())  # Neuron shim: build on the XLA device
26 | 
27 | class MegatronGradScaler(ABC):
28 | 
29 |     def __init__(self, initial_scale):
30 |         """Initialize scale value with the input initial scale."""
31 |         assert initial_scale > 0.0
32 |         self._scale = torch.cuda.FloatTensor([initial_scale])
33 | 
34 |     @property
35 |     def scale(self):
36 |         return self._scale
37 | 
38 |     @property
39 |     def inv_scale(self):
40 |         return self._scale.double().reciprocal().float()
41 | 
42 |     @abstractmethod
43 |     def update(self, found_inf):
44 |         pass
45 | 
46 |     @abstractmethod
47 |     def state_dict(self):
48 |         pass
49 | 
50 |     @abstractmethod
51 |     def load_state_dict(self, state_dict):
52 |         pass
53 | 
54 | 
55 | 
56 | class ConstantGradScaler(MegatronGradScaler):
57 | 
58 |     def update(self, found_inf):
59 |         pass
60 | 
61 |     def state_dict(self):
62 |         return dict()
63 | 
64 |     def load_state_dict(self, state_dict):
65 |         pass
66 | 
67 | 
68 | 
69 | class DynamicGradScaler(MegatronGradScaler):
70 | 
71 |     def __init__(self, initial_scale, min_scale,
72 |                  growth_factor, backoff_factor,
73 |                  growth_interval, hysteresis):
74 |         """Grad scaler with dynamic scale that gets adjusted
75 |         during training."""
76 |         super(DynamicGradScaler, self).__init__(initial_scale)
77 | 
78 |         # Lower bound on the scale.
79 |         assert min_scale > 0.0
80 |         assert min_scale <= initial_scale
81 |         self.min_scale = torch.cuda.FloatTensor([min_scale])
82 |         # Growth and backoff factors for the scale.
83 |         assert growth_factor > 1.0
84 |         self.growth_factor = torch.cuda.FloatTensor([growth_factor])
85 |         assert backoff_factor < 1.0
86 |         assert backoff_factor > 0.0
87 |         self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
88 |         # Interval over which if we don't see any inf/nan,
89 |         # we will scale the grad scale by the growth factor.
90 |         assert growth_interval > 0
91 |         self.growth_interval = growth_interval
92 |         # Number of inf/nans we should see before scaling down
93 |         # the grad scale by the backoff factor.
94 |         assert hysteresis > 0
95 |         self.hysteresis = hysteresis
96 | 
97 |         # Trackers.
98 |         self._growth_tracker = 0
99 |         self._hysteresis_tracker = self.hysteresis
100 | 
101 | 
102 |     def update(self, found_inf):
103 | 
104 |         # If we have an inf/nan, the growth tracker is set to 0
105 |         # and the hysteresis tracker is reduced by 1.
106 |         if found_inf:
107 |             self._growth_tracker = 0
108 |             self._hysteresis_tracker -= 1
109 |             # Now if we are out of hysteresis count, scale down the loss.
110 |             if self._hysteresis_tracker <= 0:
111 |                 self._scale = torch.max(self._scale * self.backoff_factor,
112 |                                         self.min_scale)
113 |         else:
114 |             # If there is no nan/inf, increment the growth tracker.
115 |             self._growth_tracker += 1
116 |             # If we have had enough consecutive intervals with no nan/inf:
117 |             if self._growth_tracker == self.growth_interval:
118 |                 # Reset the growth and hysteresis trackers,
119 |                 self._growth_tracker = 0
120 |                 self._hysteresis_tracker = self.hysteresis
121 |                 # and scale up the loss scale.
122 |                 self._scale = self._scale * self.growth_factor
123 | 
124 | 
125 |     def state_dict(self):
126 |         state_dict = {}
127 |         state_dict['scale'] = self._scale
128 |         state_dict['growth_tracker'] = self._growth_tracker
129 |         state_dict['hysteresis_tracker'] = self._hysteresis_tracker
130 |         return state_dict
131 | 
132 | 
133 |     def load_state_dict(self, state_dict):
134 |         #self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
135 |         self._scale = state_dict['scale'].to(xm.xla_device())
136 |         self._growth_tracker = state_dict['growth_tracker']
137 |         self._hysteresis_tracker = state_dict['hysteresis_tracker']
138 | 
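A small trace of the update rule above with illustrative constructor values (importing this module assumes a torch_xla environment because of the shim at the top of the file):

from megatron.optimizer.grad_scaler import DynamicGradScaler

scaler = DynamicGradScaler(initial_scale=2.**16, min_scale=1.0,
                           growth_factor=2.0, backoff_factor=0.5,
                           growth_interval=3, hysteresis=2)
scaler.update(found_inf=True)    # growth tracker -> 0, hysteresis 2 -> 1
scaler.update(found_inf=True)    # hysteresis hits 0: scale halves to 2.**15
for _ in range(3):               # growth_interval consecutive clean steps
    scaler.update(found_inf=False)
# scale doubles back to 2.**16 and both trackers are reset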
-------------------------------------------------------------------------------- /megatron/text_generation_server.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import datetime
16 | import torch
17 | import json
18 | import threading
19 | from flask import Flask, request, jsonify, current_app
20 | from flask_restful import Resource, Api
21 | from megatron import get_args
22 | from megatron import mpu
23 | from megatron.text_generation_utils import generate
24 | 
25 | GENERATE_NUM = 0
26 | lock = threading.Lock()
27 | 
28 | class MegatronGenerate(Resource):
29 |     def __init__(self, model):
30 |         self.model = model
31 | 
32 |     @staticmethod
33 |     def send_do_generate():
34 |         choice = torch.cuda.LongTensor([GENERATE_NUM])
35 |         torch.distributed.broadcast(choice, 0)
36 | 
37 |     def put(self):
38 |         args = get_args()
39 |         print("request IP: " + str(request.remote_addr))
40 |         print(json.dumps(request.get_json()), flush=True)
41 |         print("current time: ", datetime.datetime.now())
42 | 
43 |         sentences = request.get_json()["sentences"]
44 |         if len(sentences) > 128:
45 |             return "Maximum number of sentences is 128", 400
46 | 
47 |         tokens_to_generate = 64  # Choosing a hopefully sane default; generating the full sequence is slow.
48 |         if "tokens_to_generate" in request.get_json():
49 |             tokens_to_generate = request.get_json()["tokens_to_generate"]
50 |             if not isinstance(tokens_to_generate, int):
51 |                 return "tokens_to_generate must be an integer greater than 0", 400
52 |             if tokens_to_generate < 1:
53 |                 return "tokens_to_generate must be an integer greater than 0", 400
54 | 
55 |         all_probs = False
56 |         if "all_probs" in request.get_json():
57 |             all_probs = request.get_json()["all_probs"]
58 |             if not isinstance(all_probs, bool):
59 |                 return "all_probs must be a boolean value", 400
60 | 
61 |         temperature = args.temperature
62 |         if "temperature" in request.get_json():
63 |             temperature = request.get_json()["temperature"]
64 |             if not (type(temperature) == int or type(temperature) == float):
65 |                 return "temperature must be a positive number less than or equal to 100.0", 400
66 |             if not (0.0 < temperature <= 100.0):
67 |                 return "temperature must be a positive number less than or equal to 100.0", 400
68 | 
69 |         add_BOS = False
70 |         if "add_BOS" in request.get_json():
71 |             add_BOS = request.get_json()["add_BOS"]
72 |             if not isinstance(add_BOS, bool):
73 |                 return "add_BOS must be a boolean value", 400
74 | 
75 |         with lock:  # Acquire the lock so concurrent requests cannot interleave generation.
76 |             MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
77 |             resp_sentences, resp_sentences_seg, output_logits, full_logits, tokens = generate(self.model, sentences, tokens_to_generate, all_probs, temperature, add_BOS)
78 | 
79 |         if all_probs:
80 |             return jsonify({"sentences": resp_sentences,
81 |                             "segments": resp_sentences_seg,
82 |                             "logits": output_logits,
83 |                             "all_logits": full_logits,
84 |                             "tokens": tokens})
85 | 
86 |         return jsonify({"sentences": resp_sentences,
87 |                         "segments": resp_sentences_seg,
88 |                         "logits": output_logits})
89 | 
90 | class MegatronServer(object):
91 |     def __init__(self, model):
92 |         self.app = Flask(__name__, static_url_path='')
93 |         api = Api(self.app)
94 |         api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model])
95 | 
96 |     def run(self, url):
97 |         self.app.run(url, threaded=True, debug=False)
98 | 
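With the server running, the `/generate` endpoint can be exercised from Python. A sketch; the host and port are assumptions (Flask's default port is 5000), not part of this file:

import requests

resp = requests.put("http://localhost:5000/generate",
                    json={"sentences": ["The capital of France is"],
                          "tokens_to_generate": 32})
print(resp.json()["sentences"])  # prompts echoed with generated continuations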
-------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | from .tokenizer import build_tokenizer
18 | 
-------------------------------------------------------------------------------- /pretrain_gpt.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain GPT"""
17 | 
18 | import torch
19 | from functools import partial
20 | from megatron import get_args
21 | from megatron import print_rank_0
22 | from megatron import get_timers
23 | from megatron import get_tokenizer
24 | from megatron import mpu
25 | from megatron.data.gpt_dataset import build_train_valid_test_datasets
26 | from megatron.model import GPTModel, ModelType
27 | from megatron.training import pretrain
28 | from megatron.utils import get_ltor_masks_and_position_ids
29 | from megatron.utils import average_losses_across_data_parallel_group
30 | import torch_xla.core.xla_model as xm
31 | import os
32 | 
33 | os.environ["NEURON_CC_FLAGS"] = os.environ.get('NEURON_CC_FLAGS', '') + " --model-type transformer"
34 | 
35 | def model_provider(pre_process=True, post_process=True):
36 |     """Build the model."""
37 | 
38 |     device = xm.xla_device()
39 |     print_rank_0('building GPT model ...')
40 |     model = GPTModel(
41 |         num_tokentypes=0,
42 |         parallel_output=True,
43 |         pre_process=pre_process,
44 |         post_process=post_process
45 |     ).to(device)
46 |     return model
47 | 
48 | 
49 | def get_batch(data_iterator):
50 |     """Generate a batch"""
51 |     args = get_args()
52 |     tokenizer = get_tokenizer()
53 | 
54 |     # Items and their type.
55 |     keys = ['text']
56 |     datatype = torch.int64
57 | 
58 |     # Broadcast data.
59 |     if data_iterator is not None:
60 |         data = next(data_iterator)
61 |     else:
62 |         data = None
63 |     #data_b = mpu.broadcast_data(keys, data, datatype)
64 |     data_b = data
65 | 
66 |     # Unpack.
67 |     #tokens_ = data_b['text'].long()
68 |     tokens_ = data_b['text'].int()
69 |     labels = tokens_[:, 1:].contiguous()
70 |     tokens = tokens_[:, :-1].contiguous()
71 | 
72 |     # Get the masks and position ids.
73 |     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
74 |         tokens,
75 |         tokenizer.eod,
76 |         args.reset_position_ids,
77 |         args.reset_attention_mask,
78 |         args.eod_mask_loss)
79 | 
80 |     return tokens, labels, loss_mask, attention_mask, position_ids
81 | 
82 | def loss_func(loss_mask, output_tensor):
83 |     losses = output_tensor.float()
84 |     loss_mask = loss_mask.view(-1).float()
85 |     loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
86 | 
87 |     if mpu.get_data_parallel_world_size() > 1:
88 |         # Reduce loss for logging.
89 |         averaged_loss = average_losses_across_data_parallel_group([loss])
90 |         return loss, {'lm loss': averaged_loss[0].detach()}
91 |     else:
92 |         return loss, {'lm loss': loss.detach()}
93 | 
94 | 
95 | def forward_step(data_iterator, model):
96 |     """Forward step."""
97 |     args = get_args()
98 |     timers = get_timers()
99 | 
100 |     # Get the batch.
101 |     # The timers('batch-generator') start/stop calls are commented out to enable the evaluation step.
102 |     # timers('batch-generator').start()
103 |     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
104 |         data_iterator)
105 |     # timers('batch-generator').stop()
106 |     output_tensor = model(tokens, position_ids, attention_mask,
107 |                           labels=labels)
108 | 
109 |     return output_tensor, partial(loss_func, loss_mask)
110 | 
111 | 
112 | def train_valid_test_datasets_provider(train_val_test_num_samples):
113 |     """Build train, valid, and test datasets."""
114 |     args = get_args()
115 | 
116 |     print_rank_0('> building train, validation, and test datasets '
117 |                  'for GPT ...')
118 |     train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
119 |         data_prefix=args.data_path,
120 |         data_impl=args.data_impl,
121 |         splits_string=args.split,
122 |         train_valid_test_num_samples=train_val_test_num_samples,
123 |         seq_length=args.seq_length,
124 |         seed=args.seed,
125 |         skip_warmup=(not args.mmap_warmup))
126 |     print_rank_0("> finished creating GPT datasets ...")
127 |     return train_ds, valid_ds, test_ds
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     pretrain(train_valid_test_datasets_provider, model_provider,
132 |              ModelType.encoder_or_decoder,
133 |              forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
134 |     xm.rendezvous('ending')
135 | 
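The one-token shift performed in `get_batch` above is easy to see on a toy tensor (standalone sketch):

import torch

tokens_ = torch.tensor([[11, 12, 13, 14]])  # one sample of seq_length + 1 ids
labels = tokens_[:, 1:].contiguous()   # tensor([[12, 13, 14]]): next-token targets
tokens = tokens_[:, :-1].contiguous()  # tensor([[11, 12, 13]]): model inputs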
-------------------------------------------------------------------------------- /pretrain_gpt_mp.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain GPT"""
17 | 
18 | import torch
19 | from functools import partial
20 | from megatron import get_args
21 | from megatron import print_rank_0
22 | from megatron import get_timers
23 | from megatron import get_tokenizer
24 | from megatron import mpu
25 | from megatron.data.gpt_dataset import build_train_valid_test_datasets
26 | from megatron.model import GPTModel, ModelType
27 | from megatron.training import pretrain
28 | from megatron.utils import get_ltor_masks_and_position_ids
29 | from megatron.utils import average_losses_across_data_parallel_group
30 | import torch_xla.core.xla_model as xm
31 | import torch_xla.distributed.xla_multiprocessing as xmp
32 | import os
33 | import torch_xla.debug.metrics as met
34 | 
35 | import torch_xla.debug.profiler as xp
36 | 
37 | os.environ["NEURON_CC_FLAGS"] = "--model-type transformer"
38 | 
39 | def model_provider(pre_process=True, post_process=True):
40 |     """Build the model."""
41 | 
42 |     device = xm.xla_device()
43 |     print_rank_0('building GPT model ...')
44 |     model = GPTModel(
45 |         num_tokentypes=0,
46 |         parallel_output=True,
47 |         pre_process=pre_process,
48 |         post_process=post_process
49 |     ).to(device)
50 |     return model
51 | 
52 | 
53 | def get_batch(data_iterator):
54 |     """Generate a batch"""
55 |     args = get_args()
56 |     tokenizer = get_tokenizer()
57 | 
58 |     # Items and their type.
59 |     keys = ['text']
60 |     datatype = torch.int64
61 | 
62 |     # Broadcast data.
63 |     if data_iterator is not None:
64 |         data = next(data_iterator)
65 |     else:
66 |         data = None
67 |     #data_b = mpu.broadcast_data(keys, data, datatype)
68 |     data_b = data
69 | 
70 |     # Unpack.
71 |     #tokens_ = data_b['text'].long()
72 |     tokens_ = data_b['text'].int()
73 |     labels = tokens_[:, 1:].contiguous()
74 |     tokens = tokens_[:, :-1].contiguous()
75 | 
76 |     # Get the masks and position ids.
77 |     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
78 |         tokens,
79 |         tokenizer.eod,
80 |         args.reset_position_ids,
81 |         args.reset_attention_mask,
82 |         args.eod_mask_loss)
83 | 
84 |     return tokens, labels, loss_mask, attention_mask, position_ids
85 | 
86 | def loss_func(loss_mask, output_tensor):
87 |     losses = output_tensor.float()
88 |     loss_mask = loss_mask.view(-1).float()
89 |     loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
90 | 
91 |     if mpu.get_data_parallel_world_size() > 1:
92 |         # Reduce loss for logging.
93 |         averaged_loss = average_losses_across_data_parallel_group([loss])
94 |         return loss, {'lm loss': averaged_loss[0]}
95 |     else:
96 |         return loss, {'lm loss': loss}
97 | 
98 | 
99 | def forward_step(data_iterator, model):
100 |     """Forward step."""
101 |     args = get_args()
102 |     timers = get_timers()
103 | 
104 |     # Get the batch.
105 |     timers('batch-generator').start()
106 |     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
107 |         data_iterator)
108 |     timers('batch-generator').stop()
109 |     output_tensor = model(tokens, position_ids, attention_mask,
110 |                           labels=labels)
111 | 
112 |     return output_tensor, partial(loss_func, loss_mask)
113 | 
114 | 
115 | def train_valid_test_datasets_provider(train_val_test_num_samples):
116 |     """Build train, valid, and test datasets."""
117 |     args = get_args()
118 | 
119 |     print_rank_0('> building train, validation, and test datasets '
120 |                  'for GPT ...')
121 |     train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
122 |         data_prefix=args.data_path,
123 |         data_impl=args.data_impl,
124 |         splits_string=args.split,
125 |         train_valid_test_num_samples=train_val_test_num_samples,
126 |         seq_length=args.seq_length,
127 |         seed=args.seed,
128 |         skip_warmup=(not args.mmap_warmup))
129 |     print_rank_0("> finished creating GPT datasets ...")
130 |     return train_ds, valid_ds, test_ds
131 | 
132 | 
133 | def pretrain_mp(rank, world_size):
134 |     os.environ['RANK'] = str(rank)
135 |     os.environ['WORLD_SIZE'] = str(world_size)
136 |     pretrain(train_valid_test_datasets_provider, model_provider,
137 |              ModelType.encoder_or_decoder,
138 |              forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
139 |     xm.rendezvous('ending')
140 |     #xm.mark_step()
141 | 
142 | if __name__ == '__main__':
143 |     world_size = int(os.environ['NEURON_NUM_DEVICES'])
144 |     xmp.spawn(pretrain_mp,
145 |               args=(world_size,),
146 |               nprocs=world_size,
147 |               join=True)
148 | 
-------------------------------------------------------------------------------- /pretrain_vit.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain VIT"""
17 | 
18 | import torch
19 | import torch.nn.functional as F
20 | from functools import partial
21 | from megatron import get_args, get_timers, mpu, print_rank_0
22 | from megatron.data.vit_dataset import build_train_valid_datasets
23 | from megatron.model import ModelType
24 | from megatron.model.vit_model import VitModel
25 | from megatron.training import pretrain
26 | from megatron.utils import average_losses_across_data_parallel_group
27 | 
28 | def model_provider(pre_process=True, post_process=True):
29 |     """Build the model."""
30 | 
31 |     print_rank_0("building VIT model ...")
32 |     args = get_args()
33 | 
34 |     model = VitModel(num_classes=args.num_classes,
35 |                      pre_process=pre_process,
36 |                      post_process=post_process)
37 |     return model
38 | 
39 | def get_batch(data_iterator):
40 |     """Build the batch."""
41 |     data = next(data_iterator)
42 | 
43 |     # only data parallelism; no need for broadcast
44 |     images = data[0].cuda()
45 |     labels = data[1].cuda()
46 | 
47 |     return images, labels
48 | 
49 | def loss_func(labels, output_tensor):
50 |     logits = output_tensor.contiguous().float()
51 |     loss = F.cross_entropy(logits, labels)
52 | 
53 |     outputs = torch.argmax(logits, -1)
54 |     correct = (outputs == labels).float()
55 |     accuracy = torch.mean(correct)
56 | 
57 |     averaged_loss = average_losses_across_data_parallel_group([loss, accuracy])
58 | 
59 |     return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}
60 | 
61 | def forward_step(data_iterator, model):
62 |     """Forward step."""
63 |     timers = get_timers()
64 | 
65 |     # Get the batch.
66 |     timers("batch-generator").start()
67 |     (
68 |         images,
69 |         labels,
70 |     ) = get_batch(data_iterator)
71 |     timers("batch-generator").stop()
72 | 
73 |     # Forward model.
74 |     output_tensor = model(images)
75 | 
76 |     return output_tensor, partial(loss_func, labels)
77 | 
78 | def train_valid_test_datasets_provider(train_val_test_num_samples):
79 |     """Build train, valid, and test datasets."""
80 |     args = get_args()
81 | 
82 |     print_rank_0(
83 |         "> building train, validation, and test datasets for VIT ..."
84 |     )
85 |     train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path)
86 |     print_rank_0("> finished creating VIT datasets ...")
87 | 
88 |     return train_ds, valid_ds, None
89 | 
90 | 
91 | if __name__ == "__main__":
92 | 
93 |     pretrain(
94 |         train_valid_test_datasets_provider,
95 |         model_provider,
96 |         ModelType.encoder_or_decoder,
97 |         forward_step,
98 |         args_defaults={'dataloader_type': 'cyclic'}
99 |     )
-------------------------------------------------------------------------------- /tasks/data_utils.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """ Tasks data utility."""
17 | 
18 | import re
19 | import numpy as np
20 | 
21 | 
22 | def clean_text(text):
23 |     """Remove new lines and multiple spaces and adjust end of sentence dot."""
24 | 
25 |     text = text.replace("\n", " ")
26 |     text = re.sub(r'\s+', ' ', text)
27 |     for _ in range(3):
28 |         text = text.replace(' . ', '. ')
29 | 
30 |     return text
31 | 
32 | 
33 | def build_sample(ids, types, paddings, label, unique_id):
34 |     """Convert to numpy and return a sample consumed by the batch producer."""
35 | 
36 |     ids_np = np.array(ids, dtype=np.int64)
37 |     types_np = np.array(types, dtype=np.int64)
38 |     paddings_np = np.array(paddings, dtype=np.int64)
39 |     sample = ({'text': ids_np,
40 |                'types': types_np,
41 |                'padding_mask': paddings_np,
42 |                'label': int(label),
43 |                'uid': int(unique_id)})
44 | 
45 |     return sample
46 | 
47 | 
48 | def build_tokens_types_paddings_from_text(text_a, text_b,
49 |                                           tokenizer, max_seq_length):
50 |     """Build token types and paddings, trim if needed, and pad if needed."""
51 | 
52 |     text_a_ids = tokenizer.tokenize(text_a)
53 |     text_b_ids = None
54 |     if text_b is not None:
55 |         text_b_ids = tokenizer.tokenize(text_b)
56 | 
57 |     return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
58 |                                                 max_seq_length, tokenizer.cls,
59 |                                                 tokenizer.sep, tokenizer.pad)
60 | 
61 | 
62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
63 |                                          cls_id, sep_id, pad_id):
64 |     """Build token types and paddings, trim if needed, and pad if needed."""
65 | 
66 |     ids = []
67 |     types = []
68 |     paddings = []
69 | 
70 |     # [CLS].
71 |     ids.append(cls_id)
72 |     types.append(0)
73 |     paddings.append(1)
74 | 
75 |     # A.
76 |     len_text_a = len(text_a_ids)
77 |     ids.extend(text_a_ids)
78 |     types.extend([0] * len_text_a)
79 |     paddings.extend([1] * len_text_a)
80 | 
81 |     # [SEP].
82 |     ids.append(sep_id)
83 |     types.append(0)
84 |     paddings.append(1)
85 | 
86 |     # B.
87 |     if text_b_ids is not None:
88 |         len_text_b = len(text_b_ids)
89 |         ids.extend(text_b_ids)
90 |         types.extend([1] * len_text_b)
91 |         paddings.extend([1] * len_text_b)
92 | 
93 |     # Cap the size.
94 |     trimmed = False
95 |     if len(ids) >= max_seq_length:
96 |         max_seq_length_m1 = max_seq_length - 1
97 |         ids = ids[0:max_seq_length_m1]
98 |         types = types[0:max_seq_length_m1]
99 |         paddings = paddings[0:max_seq_length_m1]
100 |         trimmed = True
101 | 
102 |     # [SEP].
103 |     if (text_b_ids is not None) or trimmed:
104 |         ids.append(sep_id)
105 |         if text_b_ids is None:
106 |             types.append(0)
107 |         else:
108 |             types.append(1)
109 |         paddings.append(1)
110 | 
111 |     # Padding.
112 |     padding_length = max_seq_length - len(ids)
113 |     if padding_length > 0:
114 |         ids.extend([pad_id] * padding_length)
115 |         types.extend([pad_id] * padding_length)
116 |         paddings.extend([0] * padding_length)
117 | 
118 |     return ids, types, paddings
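A worked example of the layout produced above (illustrative ids; 101/102/0 stand in for [CLS]/[SEP]/pad):

ids, types, paddings = build_tokens_types_paddings_from_ids(
    text_a_ids=[7, 8], text_b_ids=[9], max_seq_length=8,
    cls_id=101, sep_id=102, pad_id=0)
# ids      -> [101, 7, 8, 102, 9, 102, 0, 0]
# types    -> [0,   0, 0, 0,   1, 1,   0, 0]
# paddings -> [1,   1, 1, 1,   1, 1,   0, 0]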
-------------------------------------------------------------------------------- /tasks/glue/data.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """GLUE dataset."""
17 | 
18 | from abc import ABC
19 | from abc import abstractmethod
20 | 
21 | from torch.utils.data import Dataset
22 | 
23 | from megatron import print_rank_0
24 | from tasks.data_utils import build_sample
25 | from tasks.data_utils import build_tokens_types_paddings_from_text
26 | 
27 | 
28 | class GLUEAbstractDataset(ABC, Dataset):
29 |     """GLUE base dataset class."""
30 | 
31 |     def __init__(self, task_name, dataset_name, datapaths,
32 |                  tokenizer, max_seq_length):
33 |         # Store inputs.
34 |         self.task_name = task_name
35 |         self.dataset_name = dataset_name
36 |         self.tokenizer = tokenizer
37 |         self.max_seq_length = max_seq_length
38 |         print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
39 |                                                              self.dataset_name))
40 |         # Process the files.
41 |         string = '  > paths:'
42 |         for path in datapaths:
43 |             string += ' ' + path
44 |         print_rank_0(string)
45 |         self.samples = []
46 |         for datapath in datapaths:
47 |             self.samples.extend(self.process_samples_from_single_path(datapath))
48 |         print_rank_0('  >> total number of samples: {}'.format(
49 |             len(self.samples)))
50 | 
51 |     def __len__(self):
52 |         return len(self.samples)
53 | 
54 |     def __getitem__(self, idx):
55 |         raw_sample = self.samples[idx]
56 |         ids, types, paddings = build_tokens_types_paddings_from_text(
57 |             raw_sample['text_a'], raw_sample['text_b'],
58 |             self.tokenizer, self.max_seq_length)
59 |         sample = build_sample(ids, types, paddings,
60 |                               raw_sample['label'], raw_sample['uid'])
61 |         return sample
62 | 
63 |     @abstractmethod
64 |     def process_samples_from_single_path(self, datapath):
65 |         """Abstract method that takes a single path / filename and
66 |         returns a list of dataset samples, each sample being a dict of
67 |             {'text_a': string, 'text_b': string, 'label': int, 'uid': int}
68 |         """
69 |         pass
-------------------------------------------------------------------------------- /tasks/glue/finetune.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """GLUE finetuning/evaluation."""
17 | 
18 | from megatron import get_args
19 | from megatron import print_rank_0
20 | from megatron import get_tokenizer
21 | from megatron import mpu
22 | from megatron.model.classification import Classification
23 | from tasks.eval_utils import accuracy_func_provider
24 | from tasks.finetune_utils import finetune
25 | 
26 | 
27 | def glue_classification(num_classes, Dataset,
28 |                         name_from_datapath_func):
29 | 
30 |     def train_valid_datasets_provider():
31 |         """Build train and validation dataset."""
32 |         args = get_args()
33 |         tokenizer = get_tokenizer()
34 | 
35 |         train_dataset = Dataset('training', args.train_data,
36 |                                 tokenizer, args.seq_length)
37 |         valid_dataset = Dataset('validation', args.valid_data,
38 |                                 tokenizer, args.seq_length)
39 | 
40 |         return train_dataset, valid_dataset
41 | 
42 |     def model_provider(pre_process=True, post_process=True):
43 |         """Build the model."""
44 |         args = get_args()
45 | 
46 |         print_rank_0('building classification model for {} ...'.format(
47 |             args.task))
48 |         model = Classification(num_classes=num_classes, num_tokentypes=2,
49 |                                pre_process=pre_process, post_process=post_process)
50 | 
51 |         return model
52 | 
53 |     def metrics_func_provider():
54 |         """Provide metrics callback function."""
55 |         def single_dataset_provider(datapath):
56 |             args = get_args()
57 |             tokenizer = get_tokenizer()
58 | 
59 |             name = name_from_datapath_func(datapath)
60 |             return Dataset(name, [datapath], tokenizer, args.seq_length)
61 |         return accuracy_func_provider(single_dataset_provider)
62 | 
63 |     # Finetune/evaluate.
64 |     finetune(train_valid_datasets_provider, model_provider,
65 |              end_of_epoch_callback_provider=metrics_func_provider)
66 | 
67 | 
68 | def main():
69 |     args = get_args()
70 | 
71 |     if args.task == 'MNLI':
72 | 
73 |         num_classes = 3
74 |         from tasks.glue.mnli import MNLIDataset as Dataset
75 | 
76 |         def name_from_datapath(datapath):
77 |             return datapath.split('MNLI')[-1].strip(
78 |                 '.tsv').strip('/').replace('_', '-')
79 | 
80 |     elif args.task == 'QQP':
81 | 
82 |         num_classes = 2
83 |         from tasks.glue.qqp import QQPDataset as Dataset
84 | 
85 |         def name_from_datapath(datapath):
86 |             return datapath.split('QQP')[-1].strip(
87 |                 '.tsv').strip('/').replace('_', '-')
88 | 
89 |     else:
90 |         raise NotImplementedError('GLUE task {} is not implemented.'.format(
91 |             args.task))
92 | 
93 |     glue_classification(num_classes, Dataset, name_from_datapath)
-------------------------------------------------------------------------------- /tasks/glue/mnli.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """MNLI dataset."""
17 | 
18 | from megatron import print_rank_0
19 | from tasks.data_utils import clean_text
20 | from .data import GLUEAbstractDataset
21 | 
22 | 
23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2}
24 | 
25 | 
26 | class MNLIDataset(GLUEAbstractDataset):
27 | 
28 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
29 |                  test_label='contradiction'):
30 |         self.test_label = test_label
31 |         super().__init__('MNLI', name, datapaths,
32 |                          tokenizer, max_seq_length)
33 | 
34 |     def process_samples_from_single_path(self, filename):
35 |         """Implement abstract method."""
36 |         print_rank_0(' > Processing {} ...'.format(filename))
37 | 
38 |         samples = []
39 |         total = 0
40 |         first = True
41 |         is_test = False
42 |         with open(filename, 'r') as f:
43 |             for line in f:
44 |                 row = line.strip().split('\t')
45 |                 if first:
46 |                     first = False
47 |                     if len(row) == 10:
48 |                         is_test = True
49 |                         print_rank_0(
50 |                             '   reading {}, {} and {} columns and setting '
51 |                             'labels to {}'.format(
52 |                                 row[0].strip(), row[8].strip(),
53 |                                 row[9].strip(), self.test_label))
54 |                     else:
55 |                         print_rank_0('   reading {} , {}, {}, and {} columns '
56 |                                      '...'.format(
57 |                                          row[0].strip(), row[8].strip(),
58 |                                          row[9].strip(), row[-1].strip()))
59 |                     continue
60 | 
61 |                 text_a = clean_text(row[8].strip())
62 |                 text_b = clean_text(row[9].strip())
63 |                 unique_id = int(row[0].strip())
64 |                 label = row[-1].strip()
65 |                 if is_test:
66 |                     label = self.test_label
67 | 
68 |                 assert len(text_a) > 0
69 |                 assert len(text_b) > 0
70 |                 assert label in LABELS
71 |                 assert unique_id >= 0
72 | 
73 |                 sample = {'text_a': text_a,
74 |                           'text_b': text_b,
75 |                           'label': LABELS[label],
76 |                           'uid': unique_id}
77 |                 total += 1
78 |                 samples.append(sample)
79 | 
80 |                 if total % 50000 == 0:
81 |                     print_rank_0('  > processed {} so far ...'.format(total))
82 | 
83 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
84 |         return samples
-------------------------------------------------------------------------------- /tasks/glue/qqp.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """QQP dataset."""
17 | 
18 | from megatron import print_rank_0
19 | from tasks.data_utils import clean_text
20 | from .data import GLUEAbstractDataset
21 | 
22 | 
23 | LABELS = [0, 1]
24 | 
25 | 
26 | class QQPDataset(GLUEAbstractDataset):
27 | 
28 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
29 |                  test_label=0):
30 |         self.test_label = test_label
31 |         super().__init__('QQP', name, datapaths,
32 |                          tokenizer, max_seq_length)
33 | 
34 |     def process_samples_from_single_path(self, filename):
35 |         """Implement abstract method."""
36 |         print_rank_0(' > Processing {} ...'.format(filename))
37 | 
38 |         samples = []
39 |         total = 0
40 |         first = True
41 |         is_test = False
42 |         with open(filename, 'r') as f:
43 |             for line in f:
44 |                 row = line.strip().split('\t')
45 |                 if first:
46 |                     first = False
47 |                     if len(row) == 3:
48 |                         is_test = True
49 |                         print_rank_0('   reading {}, {}, and {} columns and '
50 |                                      'setting labels to {}'.format(
51 |                                          row[0].strip(), row[1].strip(),
52 |                                          row[2].strip(), self.test_label))
53 |                     else:
54 |                         assert len(row) == 6
55 |                         print_rank_0('   reading {}, {}, {}, and {} columns'
56 |                                      ' ...'.format(
57 |                                          row[0].strip(), row[3].strip(),
58 |                                          row[4].strip(), row[5].strip()))
59 |                     continue
60 | 
61 |                 if is_test:
62 |                     assert len(row) == 3, 'expected length 3: {}'.format(row)
63 |                     uid = int(row[0].strip())
64 |                     text_a = clean_text(row[1].strip())
65 |                     text_b = clean_text(row[2].strip())
66 |                     label = self.test_label
67 |                     assert len(text_a) > 0
68 |                     assert len(text_b) > 0
69 |                 else:
70 |                     if len(row) == 6:
71 |                         uid = int(row[0].strip())
72 |                         text_a = clean_text(row[3].strip())
73 |                         text_b = clean_text(row[4].strip())
74 |                         label = int(row[5].strip())
75 |                     else:
76 |                         print_rank_0('***WARNING*** index error, '
77 |                                      'skipping: {}'.format(row))
78 |                         continue
79 |                     if len(text_a) == 0:
80 |                         print_rank_0('***WARNING*** zero length a, '
81 |                                      'skipping: {}'.format(row))
82 |                         continue
83 |                     if len(text_b) == 0:
84 |                         print_rank_0('***WARNING*** zero length b, '
85 |                                      'skipping: {}'.format(row))
86 |                         continue
87 |                 assert label in LABELS
88 |                 assert uid >= 0
89 | 
90 |                 sample = {'uid': uid,
91 |                           'text_a': text_a,
92 |                           'text_b': text_b,
93 |                           'label': label}
94 |                 total += 1
95 |                 samples.append(sample)
96 | 
97 |                 if total % 50000 == 0:
98 |                     print_rank_0('  > processed {} so far ...'.format(total))
99 | 
100 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
101 |         return samples
-------------------------------------------------------------------------------- /tasks/orqa/README.md: --------------------------------------------------------------------------------
1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
2 | 
3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
4 | 
5 | ## Retriever Training
6 | 
7 | #### Unsupervised pretraining
8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit.
We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
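For reference, each line of the input corpus (`/path/to/corpus.json` above) should be a self-contained JSON object carrying the keys passed to `--json-keys`; a minimal sketch of one such line (the field values are illustrative only):

```
{"text": "The quick brown fox jumped over the lazy dog.", "title": "An Example Document"}
```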
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs single-GPU ICT retriever training for a 217M-parameter biencoder model. Single-GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script starts from a pretrained BERT model, and we use a total batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example of how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top of [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
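# The script below runs in two phases: (1) an IndexBuilder embeds the evidence
# blocks and saves the index, and (2) an ORQAEvaluator scores the dev and test
# question sets. The qa_data_dev and qa_data_test attributes presumably map to
# --qa-data-dev and --qa-data-test command-line flags; that mapping is an
# assumption based on argparse naming conventions, not verified documentation.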
15 | 16 | """Main tasks functionality.""" 17 | 18 | from megatron import get_args, print_rank_0 19 | from megatron.indexer import IndexBuilder 20 | from tasks.orqa.evaluate_utils import ORQAEvaluator 21 | 22 | def main(): 23 | """ 24 | Main program 25 | """ 26 | 27 | args = get_args() 28 | 29 | """ 30 | Create a BlockData data structure by running an IndexBuilder over an 31 | ICT Dataset and then evaluate on NQ task 32 | """ 33 | 34 | print_rank_0("Starting index builder!") 35 | 36 | index_builder = IndexBuilder() 37 | index_builder.build_and_save_index() 38 | print_rank_0("Build and save indices: done!") 39 | 40 | 41 | print_rank_0("Starting evaluations!") 42 | 43 | # Set up the model and evaluator 44 | evaluator = ORQAEvaluator() 45 | 46 | # Run evaluation 47 | if args.qa_data_dev is not None: 48 | evaluator.evaluate(args.qa_data_dev, "DEV") 49 | 50 | if args.qa_data_test is not None: 51 | evaluator.evaluate(args.qa_data_test, "TEST") 52 | 53 | -------------------------------------------------------------------------------- /tasks/race/data.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | import json 4 | import os 5 | import time 6 | 7 | from torch.utils.data import Dataset 8 | 9 | from megatron import print_rank_0 10 | from tasks.data_utils import build_sample 11 | from tasks.data_utils import build_tokens_types_paddings_from_ids 12 | from tasks.data_utils import clean_text 13 | 14 | 15 | NUM_CHOICES = 4 16 | MAX_QA_LENGTH = 128 17 | 18 | 19 | class RaceDataset(Dataset): 20 | 21 | def __init__(self, dataset_name, datapaths, tokenizer, max_seq_length, 22 | max_qa_length=MAX_QA_LENGTH): 23 | 24 | self.dataset_name = dataset_name 25 | print_rank_0(' > building RACE dataset for {}:'.format( 26 | self.dataset_name)) 27 | 28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | 33 | self.samples = [] 34 | for datapath in datapaths: 35 | self.samples.extend(process_single_datapath(datapath, tokenizer, 36 | max_qa_length, 37 | max_seq_length)) 38 | 39 | print_rank_0(' >> total number of samples: {}'.format( 40 | len(self.samples))) 41 | 42 | # This indicates that each "sample" has multiple samples that 43 | # will collapse into batch dimension 44 | self.sample_multiplier = NUM_CHOICES 45 | 46 | def __len__(self): 47 | return len(self.samples) 48 | 49 | def __getitem__(self, idx): 50 | return self.samples[idx] 51 | 52 | 53 | def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length): 54 | """Read in RACE files, combine, clean-up, tokenize, and convert to 55 | samples.""" 56 | 57 | print_rank_0(' > working on {}'.format(datapath)) 58 | start_time = time.time() 59 | 60 | # Get list of files. 61 | filenames = glob.glob(os.path.join(datapath, '*.txt')) 62 | 63 | samples = [] 64 | num_docs = 0 65 | num_questions = 0 66 | num_samples = 0 67 | # Load all the files 68 | for filename in filenames: 69 | with open(filename, 'r') as f: 70 | for line in f: 71 | data = json.loads(line) 72 | num_docs += 1 73 | 74 | context = data["article"] 75 | questions = data["questions"] 76 | choices = data["options"] 77 | answers = data["answers"] 78 | # Check the length. 79 | assert len(questions) == len(answers) 80 | assert len(questions) == len(choices) 81 | 82 | # Context: clean up and convert to ids. 83 | context = clean_text(context) 84 | context_ids = tokenizer.tokenize(context) 85 | 86 | # Loop over questions. 
87 | for qi, question in enumerate(questions): 88 | num_questions += 1 89 | # Label. 90 | label = ord(answers[qi]) - ord("A") 91 | assert label >= 0 92 | assert label < NUM_CHOICES 93 | assert len(choices[qi]) == NUM_CHOICES 94 | 95 | # For each question, build num-choices samples. 96 | ids_list = [] 97 | types_list = [] 98 | paddings_list = [] 99 | for ci in range(NUM_CHOICES): 100 | choice = choices[qi][ci] 101 | # Merge with choice. 102 | if "_" in question: 103 | qa = question.replace("_", choice) 104 | else: 105 | qa = " ".join([question, choice]) 106 | # Clean QA. 107 | qa = clean_text(qa) 108 | # Tokenize. 109 | qa_ids = tokenizer.tokenize(qa) 110 | # Trim if needed. 111 | if len(qa_ids) > max_qa_length: 112 | qa_ids = qa_ids[0:max_qa_length] 113 | 114 | # Build the sample. 115 | ids, types, paddings \ 116 | = build_tokens_types_paddings_from_ids( 117 | qa_ids, context_ids, max_seq_length, 118 | tokenizer.cls, tokenizer.sep, tokenizer.pad) 119 | 120 | ids_list.append(ids) 121 | types_list.append(types) 122 | paddings_list.append(paddings) 123 | 124 | # Convert to numpy and add to samples 125 | samples.append(build_sample(ids_list, types_list, 126 | paddings_list, label, 127 | num_samples)) 128 | num_samples += 1 129 | 130 | elapsed_time = time.time() - start_time 131 | print_rank_0(' > processed {} documents, {} questions, and {} samples' 132 | ' in {:.2f} seconds'.format(num_docs, num_questions, 133 | num_samples, elapsed_time)) 134 | 135 | return samples 136 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Privde metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(pre_process=True, post_process=True): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True, 44 | pre_process=pre_process, post_process=post_process) 45 | 46 | """Finetune/evaluate.""" 47 | finetune( 48 | train_valid_datasets_provider, 49 | model_provider, 50 | end_of_epoch_callback_provider=accuracy_func_provider, 51 | ) 52 | 53 | 54 | def main(): 55 | classification() 56 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | from functools import partial 20 | 21 | import torch 22 | 23 | from megatron import get_args 24 | from megatron import print_rank_0, print_rank_last 25 | from megatron import mpu 26 | from megatron.schedules import get_forward_backward_func 27 | from tasks.vision.finetune_utils import build_data_loader 28 | from tasks.vision.finetune_utils import process_batch 29 | from torchvision import datasets, transforms 30 | 31 | 32 | def accuracy_func_provider(): 33 | """Provide function that calculates accuracies.""" 34 | args = get_args() 35 | data_path = args.data_path 36 | crop_size = args.img_dim 37 | 38 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 39 | # Build dataloaders. 
40 | val_data_path = os.path.join(data_path[0], "val") 41 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 42 | transform_val = transforms.Compose( 43 | [ 44 | transforms.Resize(crop_size), 45 | transforms.CenterCrop(crop_size), 46 | transforms.ToTensor(), 47 | normalize, 48 | ] 49 | ) 50 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 51 | 52 | dataloader = build_data_loader( 53 | dataset, 54 | args.micro_batch_size, 55 | num_workers=args.num_workers, 56 | drop_last=(mpu.get_data_parallel_world_size() > 1), 57 | ) 58 | 59 | def metrics_func(model, epoch): 60 | print_rank_0("calculating metrics ...") 61 | correct, total = calculate_correct_answers(model, dataloader, epoch) 62 | percent = float(correct) * 100.0 / float(total) 63 | print_rank_last( 64 | " >> |epoch: {}| overall: correct / total = {} / {} = " 65 | "{:.4f} %".format(epoch, correct, total, percent) 66 | ) 67 | 68 | return metrics_func 69 | 70 | 71 | def calculate_correct_answers(model, dataloader, epoch): 72 | """Calculate correct over total answers""" 73 | 74 | args = get_args() 75 | forward_backward_func = get_forward_backward_func() 76 | for m in model: 77 | m.eval() 78 | 79 | def loss_func(labels, output_tensor): 80 | logits = output_tensor 81 | 82 | loss_dict = {} 83 | # Compute the correct answers. 84 | predicted = torch.argmax(logits, dim=-1) 85 | corrects = (predicted == labels).float() 86 | # Add to the counters. 87 | loss_dict['total'] = labels.size(0) 88 | loss_dict['correct'] = corrects.sum().item() 89 | 90 | return 0, loss_dict 91 | 92 | # Defined inside so the forward step can bind labels into loss_func. 93 | def correct_answers_forward_step(batch, model): 94 | try: 95 | batch_ = next(batch) 96 | except BaseException: 97 | batch_ = batch 98 | images, labels = process_batch(batch_) 99 | 100 | # Forward model. 101 | output_tensor = model(images) 102 | 103 | 104 | return output_tensor, partial(loss_func, labels) 105 | 106 | with torch.no_grad(): 107 | # For all the batches in the dataset. 108 | total = 0 109 | correct = 0 110 | for _, batch in enumerate(dataloader): 111 | 112 | loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model, 113 | optimizer=None, timers=None, forward_only=True) 114 | 115 | for loss_dict in loss_dicts: 116 | total += loss_dict['total'] 117 | correct += loss_dict['correct'] 118 | 119 | for m in model: 120 | m.train() 121 | 122 | # Reduce. 123 | if mpu.is_pipeline_last_stage(): 124 | unreduced = torch.cuda.LongTensor([correct, total]) 125 | torch.distributed.all_reduce(unreduced, 126 | group=mpu.get_data_parallel_group()) 127 | 128 | # Print on screen. 129 | correct_ans = unreduced[0].item() 130 | total_count = unreduced[1].item() 131 | return correct_ans, total_count 132 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. Zero results in " 43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader.", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ")
") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare training dataset to train the mode. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 
16 | ``` 17 | python blacklist_urls.py <path to the downloaded deduplicated URLs> <output file for clean URLs> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT training: 24 | 25 | 1. Run ftfy and English detection, and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data file> 28 | ``` 29 | Additional cleanup (e.g. removing documents shorter than 512 characters, or dataset-specific cleaning for the stories and realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <input data files> --output <output possible duplicates file> 33 | ``` 34 | 3. Based on the similarity measure defined inside the `is_similar` function (default threshold: 0.9), group urls that are similar. For each group, we keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <possible duplicates file> <output url groups file> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <url groups file> <data file> <output data file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <deduplicated data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <task files> --dedup-dataset <training dataset> --output <output training dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, or any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path via `--lambada-path <path to lambada task data>`. 58 | 59 | Several other features (e.g. save and load dictionary) have been added; see `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
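# Illustrative input/output for this script (the field values are made up):
#   in:  {"text": "some document ...", "url": "https://example.com/a"}
#   out: {"text": "some document ...", "url": "https://example.com/a",
#         "adlr_id": "owt-0000000001"}
# where "owt" stands in for whatever --id-prefix is passed on the command line.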
15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. The user can add a 23 | prefix to the ids. 24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the files. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/cleanup_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
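# Usage (positional arguments, as read from sys.argv at the bottom of this
# file):
#   python cleanup_dataset.py <input loose-json file> <output file>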
15 | 16 | 17 | import ftfy 18 | import json 19 | from langdetect import detect 20 | import numpy as np 21 | import time 22 | import os 23 | import sys 24 | 25 | from tokenizer import Tokenizer 26 | 27 | MIN_DOCUMENT_LENGTH = 128 28 | 29 | 30 | def print_progress(prefix, start_time, num_docs, num_fixed_text, 31 | num_non_english_docs, chars_non_english_docs, 32 | num_small_docs, chars_small_docs): 33 | 34 | string = prefix + ' | ' 35 | string += 'elapsed time: {:.2f} | '.format(time.time() - start_time) 36 | string += 'documents: {} | '.format(num_docs) 37 | string += 'fixed text: {} | '.format(num_fixed_text) 38 | string += 'non-english: {} | '.format(num_non_english_docs) 39 | string += 'non-english chars: {} | '.format(chars_non_english_docs) 40 | string += 'small docs: {} | '.format(num_small_docs) 41 | string += 'small docs chars: {}'.format(chars_small_docs) 42 | print(string, flush=True) 43 | 44 | 45 | def filter_corpus(filename, out_filename, print_interval=10000): 46 | 47 | print(' > filtering {}'.format(filename)) 48 | 49 | tokenizer = Tokenizer(cache_dir='./cache') 50 | 51 | num_docs = 0 52 | num_written_docs = 0 53 | num_small_docs = 0 54 | num_fixed_text = 0 55 | num_non_english_docs = 0 56 | chars_non_english_docs = 0 57 | chars_small_docs = 0 58 | start_time = time.time() 59 | with open(out_filename, 'wb') as f: 60 | with open(filename, 'r') as fin: 61 | for line in fin: 62 | try: 63 | num_docs += 1 64 | myjson = json.loads(line) 65 | # Fix text 66 | text = ftfy.fix_text(myjson['text']) 67 | if text != myjson['text']: 68 | num_fixed_text += 1 69 | myjson['text'] = text 70 | # Detect language. 71 | if detect(text) != 'en': 72 | print('[non-english text]', myjson) 73 | num_non_english_docs += 1 74 | chars_non_english_docs += len(text) 75 | continue 76 | # On average each token is 5 characters so 8 is an 77 | # upper bound. 78 | if len(text) < (8 * MIN_DOCUMENT_LENGTH): 79 | tokens = tokenizer.tokenize_document(text) 80 | if len(tokens) < MIN_DOCUMENT_LENGTH: 81 | print('[small document, skipping]:', myjson) 82 | num_small_docs += 1 83 | chars_small_docs += len(text) 84 | continue 85 | myjson = json.dumps(myjson, ensure_ascii=False) 86 | f.write(myjson.encode('utf-8')) 87 | f.write('\n'.encode('utf-8')) 88 | num_written_docs += 1 89 | if num_docs % print_interval == 0: 90 | print_progress('[PROGRESS]', start_time, num_docs, 91 | num_fixed_text, num_non_english_docs, 92 | chars_non_english_docs, 93 | num_small_docs, chars_small_docs) 94 | except Exception as e: 95 | print(' skipping ', line, e) 96 | 97 | print_progress('[FINAL]', start_time, num_docs, 98 | num_fixed_text, num_non_english_docs, 99 | chars_non_english_docs, 100 | num_small_docs, chars_small_docs) 101 | 102 | 103 | if __name__ == '__main__': 104 | 105 | print('building gpt2 dataset ...') 106 | 107 | input_filename = sys.argv[1] 108 | output_filename = sys.argv[2] 109 | 110 | print('will be reading {}'.format(input_filename)) 111 | print('and will write the results to {}'.format(output_filename)) 112 | 113 | filter_corpus(input_filename, output_filename) 114 | 115 | 116 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
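# Usage, based on the argparse definitions below:
#   python merge_jsons.py --json_path <directory with *.json files> \
#                         --output_file <merged_output.json>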
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row)  # parse each line to verify it is valid JSON 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
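# Usage (positional arguments, as read from sys.argv below):
#   python remove_group_duplicates.py <url groups file> <data file> <output file>
# <url groups file> is the output of group_duplicate_url.py; within each group,
# every url except the first is treated as a duplicate and its document dropped.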
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /tools/run_text_generation_server.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
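# A sketch of launching the server (model-size and parallelism flags are
# omitted here and must match the checkpoint being loaded):
#   python tools/run_text_generation_server.py --load <checkpoint dir> \
#       --temperature 1.0 --top_p 0.9 --out-seq-length 1024 ...
# Once the server is up, tools/text_generation_cli.py can PUT prompts to it.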
15 | 16 | """Sample Generate GPT""" 17 | import os 18 | import sys 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 20 | os.path.pardir))) 21 | import socket 22 | from megatron import get_args 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | from megatron.checkpointing import load_checkpoint 26 | from megatron.initialize import initialize_megatron 27 | from megatron.model import GPTModel 28 | from megatron.training import get_model 29 | from megatron.text_generation_server import MegatronServer 30 | from megatron.text_generation_utils import generate 31 | import torch 32 | 33 | def model_provider(pre_process=True, post_process=True): 34 | """Build the model.""" 35 | 36 | print_rank_0('building GPT model ...') 37 | model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process) 38 | 39 | return model 40 | 41 | def add_text_generate_args(parser): 42 | group = parser.add_argument_group(title='text generation') 43 | 44 | group.add_argument("--temperature", type=float, default=1.0, 45 | help='Sampling temperature.') 46 | group.add_argument("--greedy", action='store_true', default=False, 47 | help='Use greedy sampling.') 48 | group.add_argument("--top_p", type=float, default=0.0, 49 | help='Top p sampling.') 50 | group.add_argument("--top_k", type=int, default=0, 51 | help='Top k sampling.') 52 | group.add_argument("--out-seq-length", type=int, default=1024, 53 | help='Size of the output generated text.') 54 | return parser 55 | 56 | 57 | if __name__ == "__main__": 58 | initialize_megatron(extra_args_provider=add_text_generate_args, 59 | args_defaults={'tokenizer_type': 'GPT2BPETokenizer', 60 | 'no_load_rng': True, 61 | 'no_load_optim': True}) 62 | 63 | args = get_args() 64 | if args.num_layers_per_virtual_pipeline_stage is not None: 65 | print("Interleaved pipeline schedule is not yet supported for text generation.") 66 | exit() 67 | # Set up model and load checkpoint 68 | model = get_model(model_provider, wrap_with_ddp=False) 69 | 70 | if args.load is not None: 71 | _ = load_checkpoint(model, None, None) 72 | 73 | assert len(model) == 1, "Above condition should have caught this" 74 | model = model[0] 75 | if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: 76 | server = MegatronServer(model) 77 | server.run("0.0.0.0") 78 | 79 | while True: 80 | choice = torch.cuda.LongTensor(1) 81 | torch.distributed.broadcast(choice, 0) 82 | if choice[0].item() == 0: 83 | generate(model) 84 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import json 16 | import sys 17 | from urllib import request 18 | 19 | 20 | if __name__ == "__main__": 21 | url = sys.argv[1] 22 | while True: 23 | sentence = input("Enter prompt: ") 24 | max_len = int(input("Enter number tokens output: ")) 25 | data = json.dumps({"sentences": [sentence], "max_len": max_len}) 26 | # Send the prompt to the server as an HTTP PUT with a JSON payload. 27 | req = request.Request(url, data.encode('utf-8'), 28 | {'Content-Type': 'application/json'}, 29 | method='PUT') 30 | response = request.urlopen(req) 31 | resp_sentences = json.load(response) 32 | print("Megatron Response: ") 33 | print(resp_sentences["sentences"][0]) 34 | --------------------------------------------------------------------------------