├── .gitignore ├── 1.pre_train_math.py ├── 1.pre_train_math_moe.py ├── 3.pretrain_gpt125M.sh ├── 3.pretrain_llama2.sh ├── 4.aft_train_math.py ├── README.md ├── assets ├── pre_math.png └── title.png ├── scripts ├── kill_process.sh ├── sbatch.sh └── srun.sh └── src ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── model_parallel_config.py │ ├── models │ │ ├── __init__.py │ │ └── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_embedding.py │ │ │ └── gpt_model.py │ ├── package_info.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── sequence_parallel │ │ ├── __init__.py │ │ └── cross_entropy.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── core_attention.py │ │ ├── custom_layers │ │ │ └── transformer_engine.py │ │ ├── enums.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_softmax.cpp │ ├── scaled_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rotary_pos_embedding.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── p2p_communication.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py 
│ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── text_generation_utils.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain.py ├── scripts ├── generate_text.sh ├── gpt │ └── ds_config_gpt_TEMPLATE.json ├── pretrain_llama_distributed.sh ├── run_deepspeed_example.sh └── sequence_parallel │ ├── README.md │ ├── ds_config_gpt_TEMPLATE.json │ ├── ds_pretrain_gpt_1.3B_seq_parallel_32k.sh │ └── ds_pretrain_gpt_30B_seq_parallel_32k.sh └── tools ├── convert_checkpoint ├── convert_llama_weights_to_hf.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── loader_llama2_hf.py ├── saver_megatron.py └── utils.py └── preprocess_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # *.txt, *.xls: ignore all files of a given type 2 | # target/: ignore everything under that directory 3 | # /test/a.txt, /test/b.xls: ignore specific files under a directory 4 | # !*.java, !/dir/test/: a leading ! means do not ignore 5 | # *.[ab]: wildcards are supported, ignore every file ending in .a or .b 6 | # /test: only ignore test at the repository root, not non-root test directories such as child/test 7 | 8 | 9 | /ckpts/* 10 | -------------------------------------------------------------------------------- /3.pretrain_llama2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example script was contributed by external user https://github.com/nrailgun 3 | set -ex 4 | 5 | ###################################### 6 | # Change the configurations below 7 | BASE_PATH=./tmp 8 | DS_CONFIG=${BASE_PATH}/deepspeed.json 9 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" 10 | DATASET="1 ${DATASET_1}" 11 | CHECKPOINT_PATH=./tmp 12 | TOKENIZER_PATH=./tmp/tokenizer.model # official llama tokenizer.model 13 | 14 | TP=2 15 | PP=2 16 | ZERO_STAGE=0 17 | 18 | GPUS_PER_NODE=8 19 | MASTER_ADDR=localhost 20 | MASTER_PORT=6000 21 | NNODES=1 22 | NODE_RANK=0 23 | 24 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 25 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 26 | NUM_LAYERS=24 # e.g. llama-13b: 40 27 | NUM_HEADS=16 # e.g. llama-13b: 40 28 | SEQ_LENGTH=2048 29 | NUM_KV_HEADS=4 # llama2 70B uses GQA 30 | 31 | MICRO_BATCH_SIZE=4 32 | GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens 33 | TRAIN_STEPS=250000 # e.g.
llama: 1T tokens / 4M tokens_per_batch = 250000 steps 34 | LR=3e-4 35 | MIN_LR=3e-5 36 | LR_WARMUP_STEPS=2000 37 | WEIGHT_DECAY=0.1 38 | GRAD_CLIP=1 39 | 40 | ## Activation checkpointing saves GPU memory, but reduces training speed 41 | activation_checkpoint="true" 42 | # activation_checkpoint="false" 43 | 44 | # Below configuration required for llama model as per llama paper 45 | # --no-query-key-layer-scaling \ 46 | # --attention-dropout 0 \ 47 | # --hidden-dropout 0 \ 48 | # --use-rotary-position-embeddings \ 49 | # --untie-embeddings-and-output-weights \ 50 | # --swiglu \ 51 | # --normalization rmsnorm \ 52 | # --disable-bias-linear \ 53 | ###################################### 54 | 55 | 56 | 57 | cat < $DS_CONFIG 58 | { 59 | "train_batch_size" : $GLOBAL_BATCH_SIZE, 60 | "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, 61 | "steps_per_print": 1, 62 | "zero_optimization": { 63 | "stage": $ZERO_STAGE 64 | }, 65 | "bf16": { 66 | "enabled": true 67 | } 68 | } 69 | EOT 70 | 71 | ds_args="" 72 | ds_args=" --deepspeed ${ds_args}" 73 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 74 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 75 | 76 | if [ "${activation_checkpoint}" = "true" ]; then 77 | ds_args="--deepspeed-activation-checkpointing ${ds_args}" 78 | 79 | ## old argument for recomputing the transformer layer 80 | # ds_args="--checkpoint-activations ${ds_args}" 81 | 82 | ## new argument for recomputing the transformer layer 83 | ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" 84 | ## new argument for recomputing only the attention layer 85 | # ds_args="--recompute-granularity selective ${ds_args}" 86 | fi 87 | 88 | 89 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 90 | 91 | torchrun $DISTRIBUTED_ARGS \ 92 | ./src/pretrain.py \ 93 | --tensor-model-parallel-size $TP \ 94 | --pipeline-model-parallel-size $PP \ 95 | --num-layers $NUM_LAYERS \ 96 | --hidden-size $HIDDEN_SIZE \ 97 | --ffn-hidden-size $FFN_HIDDEN_SIZE \ 98 | --num-attention-heads $NUM_HEADS \ 99 | --micro-batch-size $MICRO_BATCH_SIZE \ 100 | --global-batch-size $GLOBAL_BATCH_SIZE \ 101 | --seq-length $SEQ_LENGTH \ 102 | --max-position-embeddings $SEQ_LENGTH \ 103 | --train-iters $TRAIN_STEPS \ 104 | --save $CHECKPOINT_PATH \ 105 | --load $CHECKPOINT_PATH \ 106 | --data-path $DATASET \ 107 | --data-impl mmap \ 108 | --tokenizer-type GPTSentencePieceTokenizer \ 109 | --tokenizer-model $TOKENIZER_PATH \ 110 | --split 949,50,1 \ 111 | --distributed-backend nccl \ 112 | --lr $LR \ 113 | --lr-decay-style cosine \ 114 | --min-lr $MIN_LR \ 115 | --weight-decay $WEIGHT_DECAY \ 116 | --clip-grad $GRAD_CLIP \ 117 | --lr-warmup-iters $LR_WARMUP_STEPS \ 118 | --optimizer adam \ 119 | --adam-beta1 0.9 \ 120 | --adam-beta2 0.95 \ 121 | --log-interval 1 \ 122 | --save-interval 10000 \ 123 | --eval-interval 1000 \ 124 | --eval-iters 10 \ 125 | --bf16 \ 126 | --no-query-key-layer-scaling \ 127 | --attention-dropout 0 \ 128 | --hidden-dropout 0 \ 129 | --use-rotary-position-embeddings \ 130 | --untie-embeddings-and-output-weights \ 131 | --swiglu \ 132 | --normalization rmsnorm \ 133 | --disable-bias-linear \ 134 | --num-key-value-heads $NUM_KV_HEADS \ 135 | $ds_args 136 | -------------------------------------------------------------------------------- /4.aft_train_math.py: -------------------------------------------------------------------------------- 1 | model_size_in_B=7 2 | seqlen=4096 3 | 
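# The TFLOPS estimate below follows the model-FLOPs approximation cited in the formula
# (https://arxiv.org/pdf/2104.04473.pdf): roughly 2 FLOPs per parameter per token for the
# forward pass, about twice that for the backward pass, plus one recomputed forward pass,
# i.e. roughly 8 * params * tokens per iteration, which is the 4 * 2 factor in the formula.
# With the values used in this script: 7e9 * 8 * (4096 * 128) / (6.8 s * 32 GPUs) ~= 1.35e14
# FLOP/s, i.e. about 135 achieved TFLOPS per GPU.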
global_batch_size=128 4 | time_in_sec_per_iteration=6.8 5 | total_gpus=32 6 | 7 | TFLOPS=model_size_in_B * 4 * 2 * seqlen * global_batch_size / (time_in_sec_per_iteration * total_gpus * 1e3) # https://arxiv.org/pdf/2104.04473.pdf 8 | print(f'TFLOPS:{TFLOPS:.2f}') -------------------------------------------------------------------------------- /assets/pre_math.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/assets/pre_math.png -------------------------------------------------------------------------------- /assets/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/assets/title.png -------------------------------------------------------------------------------- /scripts/kill_process.sh: -------------------------------------------------------------------------------- 1 | # Terminate running processes 2 | # pkill -f "python -u src/gpt4_eval.py" 3 | pkill -f "python" 4 | 5 | if [ $? -eq 0 ]; then 6 | echo "Process terminated." 7 | else 8 | echo "No process found." 9 | fi -------------------------------------------------------------------------------- /scripts/sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J test 3 | #SBATCH -p p-A100 4 | #SBATCH -N 1 5 | #SBATCH --cpus-per-task=96 6 | #SBATCH --reservation=root_114 # only for user wangbeny; other users should remove this line 7 | #SBATCH -w pgpu17 8 | #SBATCH --gres=gpu:8 9 | 10 | bash /mntcephfs/data/med/xidong/yaojishi/gen_ans.sh # the command to run -------------------------------------------------------------------------------- /scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | srun --job-name=test --gres=gpu:2 -w pgpu20 -p p-A100 -c 24 --reservation=root_114 --pty bash -------------------------------------------------------------------------------- /src/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_adlr_autoresume 13 | from .global_vars import get_timers 14 | from .initialize import initialize_megatron 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last, 19 | is_rank_0, 20 | is_aml) 21 | -------------------------------------------------------------------------------- /src/megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models.
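A minimal usage sketch, assuming `torch.distributed` can be initialized (for example when launched with `torchrun`); the parallel sizes are placeholders and the snippet only exercises `parallel_state` from this package:

```python
# Sketch only: assumes a torchrun launch so the process-group environment variables are set.
import torch
from megatron.core import parallel_state

torch.distributed.init_process_group(backend="nccl")
parallel_state.initialize_model_parallel(
    tensor_model_parallel_size=2,   # placeholder values for illustration
    pipeline_model_parallel_size=1,
)
print("tensor-parallel rank:", parallel_state.get_tensor_model_parallel_rank())
```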
2 | -------------------------------------------------------------------------------- /src/megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | from .model_parallel_config import ModelParallelConfig 6 | 7 | # Alias parallel_state as mpu, its legacy name 8 | mpu = parallel_state 9 | 10 | __all__ = [ 11 | "parallel_state", 12 | "tensor_parallel", 13 | "utils", 14 | "ModelParallelConfig" 15 | ] 16 | -------------------------------------------------------------------------------- /src/megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | -------------------------------------------------------------------------------- /src/megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /src/megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from typing import Tuple, Optional 5 | 6 | def _bias_dropout_add_func(x, bias, residual, prob, training): 7 | # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor 8 | # NOTE: Previously, the argument `bias` used to be passed as 9 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 10 | # transformer layer but broadcasting should automatically take care of that. 11 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 12 | # seem to be identical performance-wise (both just change the view). 13 | if bias is not None: 14 | x = x + bias 15 | out = torch.nn.functional.dropout(x, p=prob, training=training) 16 | out = residual + out 17 | return out 18 | 19 | def get_bias_dropout_add(training, fused): 20 | 21 | def unfused_bias_dropout_add(x_with_bias, residual, prob): 22 | x, bias = x_with_bias # unpack 23 | return _bias_dropout_add_func(x, bias, residual, prob, training) 24 | 25 | @torch.jit.script 26 | def bias_dropout_add_fused_train( 27 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 28 | residual: torch.Tensor, 29 | prob: float 30 | ) -> torch.Tensor: 31 | x, bias = x_with_bias # unpack 32 | return _bias_dropout_add_func(x, bias, residual, prob, True) 33 | 34 | @torch.jit.script 35 | def bias_dropout_add_fused_inference( 36 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 37 | residual: torch.Tensor, 38 | prob: float 39 | ) -> torch.Tensor: 40 | x, bias = x_with_bias # unpack 41 | return _bias_dropout_add_func(x, bias, residual, prob, False) 42 | 43 | if fused: 44 | # jit scripting for a nn.module (with dropout) is not 45 | # triggering the fusion kernel. For now, we use two 46 | # different nn.functional routines to account for varying 47 | # dropout semantics during training and inference phases. 
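    # Each fused variant below is scripted with `training` baked in as a constant, so the
    # bias-add + dropout + residual-add chain can be fused by the TorchScript JIT.
    # Call pattern used by the transformer layer (sketch):
    #   func = get_bias_dropout_add(training, fused=True)
    #   out = func((hidden, bias_or_none), residual, hidden_dropout_prob)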
48 | if training: 49 | return bias_dropout_add_fused_train 50 | else: 51 | return bias_dropout_add_fused_inference 52 | else: 53 | return unfused_bias_dropout_add 54 | -------------------------------------------------------------------------------- /src/megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /src/megatron/core/fusions/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import numbers 4 | import torch 5 | from torch.nn.parameter import Parameter 6 | from torch.nn import init 7 | import importlib 8 | 9 | from megatron.core.utils import make_viewless_tensor 10 | 11 | try: 12 | from apex.contrib.layer_norm.layer_norm import FastLayerNormFN 13 | HAVE_PERSIST_LAYER_NORM = True 14 | except: 15 | HAVE_PERSIST_LAYER_NORM = False 16 | 17 | try: 18 | from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction 19 | HAVE_FUSED_LAYER_NORM = True 20 | except: 21 | HAVE_FUSED_LAYER_NORM = False 22 | 23 | 24 | class FusedLayerNorm(torch.nn.Module): 25 | 26 | def __init__(self, hidden_size, eps=1e-5, 27 | persist_layer_norm=True, 28 | sequence_parallel=False, 29 | zero_centered_gamma=False): 30 | super().__init__() 31 | 32 | self.zero_centered_gamma = zero_centered_gamma 33 | 34 | # List of hiddens sizes supported in the persistent layer norm kernel 35 | # If the hidden size is not supported, fall back to the non-persistent 36 | # kernel. 
37 | persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096, 38 | 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480, 39 | 24576, 25600, 30720, 32768, 40960, 49152, 65536] 40 | if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM: 41 | persist_layer_norm = False 42 | 43 | if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM: 44 | # TODO: Add pytorch only layer norm 45 | raise ValueError(f'Apex must currently be installed to use megatron core.') 46 | 47 | if isinstance(hidden_size, numbers.Integral): 48 | hidden_size = (hidden_size,) 49 | self.hidden_size = torch.Size(hidden_size) 50 | self.eps = eps 51 | self.weight = Parameter(torch.Tensor(*hidden_size)) 52 | self.bias = Parameter(torch.Tensor(*hidden_size)) 53 | self.reset_parameters() 54 | self.persist_layer_norm = persist_layer_norm 55 | self.sequence_parallel = sequence_parallel 56 | 57 | # set sequence parallelism flag on weight and bias parameters 58 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 59 | setattr(self.bias, 'sequence_parallel', self.sequence_parallel) 60 | 61 | 62 | def reset_parameters(self): 63 | 64 | if self.zero_centered_gamma: 65 | init.zeros_(self.weight) 66 | init.zeros_(self.bias) 67 | else: 68 | init.ones_(self.weight) 69 | init.zeros_(self.bias) 70 | 71 | def forward(self, input): 72 | 73 | weight = self.weight + 1 if self.zero_centered_gamma else self.weight 74 | 75 | if self.persist_layer_norm: 76 | output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) 77 | 78 | # Apex's fast layer norm function outputs a 'view' tensor (i.e., has 79 | # a populated '_base' field). This will result in schedule.py's 80 | # deallocate_output_tensor() throwing an error, so a viewless tensor is 81 | # created to prevent this. 82 | output = make_viewless_tensor(inp = output, 83 | requires_grad = input.requires_grad, 84 | keep_graph = True) 85 | 86 | else: 87 | output = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.hidden_size, self.eps) 88 | 89 | return output 90 | -------------------------------------------------------------------------------- /src/megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /src/megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /src/megatron/core/models/gpt/gpt_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core import tensor_parallel 6 | 7 | from megatron.core.transformer.module import MegatronModule 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | 10 | 11 | class GPTEmbedding(MegatronModule): 12 | """Language model embeddings. 13 | 14 | Arguments: 15 | config (TransformerConfig): config object with all necessary configs for TransformerBlock 16 | vocab_size (int): vocabulary size 17 | max_sequence_length (int): maximum size of sequence. 
This 18 | is used for positional embedding 19 | embedding_dropout_prob float): dropout probability for embeddings 20 | """ 21 | 22 | def __init__(self, config: TransformerConfig, vocab_size: int, max_sequence_length: int): 23 | super().__init__(config=config) 24 | 25 | self.config: TransformerConfig = config 26 | self.vocab_size: int = vocab_size 27 | self.max_sequence_length: int = max_sequence_length 28 | 29 | # Word embeddings (parallel). 30 | self.word_embeddings = tensor_parallel.VocabParallelEmbedding( 31 | num_embeddings=self.vocab_size, 32 | embedding_dim=self.config.hidden_size, 33 | init_method=self.config.init_method, 34 | config=self.config 35 | ) 36 | # @jcasper are these keys needed? 37 | self._word_embeddings_key = 'word_embeddings' 38 | 39 | # Position embedding (serial). 40 | self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size) 41 | self._position_embeddings_key = 'position_embeddings' 42 | 43 | # Initialize the position embeddings. 44 | if self.config.perform_initialization: 45 | self.config.init_method(self.position_embeddings.weight) 46 | 47 | # Embeddings dropout 48 | self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout) 49 | 50 | def zero_parameters(self): 51 | """Zero out all parameters in embedding.""" 52 | self.word_embeddings.weight.data.fill_(0) 53 | self.word_embeddings.weight.shared = True 54 | self.position_embeddings.weight.data.fill_(0) 55 | self.position_embeddings.weight.shared = True 56 | 57 | def forward(self, input_ids, position_ids): 58 | # Embeddings. 59 | words_embeddings = self.word_embeddings(input_ids) 60 | position_embeddings = self.position_embeddings(position_ids) 61 | embeddings = words_embeddings + position_embeddings 62 | 63 | # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. 64 | embeddings = embeddings.transpose(0, 1).contiguous() 65 | 66 | # If the input flag for fp32 residual connection is set, convert for float. 67 | if self.config.fp32_residual_connection: 68 | embeddings = embeddings.float() 69 | 70 | # Dropout. 71 | if self.config.sequence_parallel: 72 | embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) 73 | with tensor_parallel.get_cuda_rng_tracker().fork(): 74 | embeddings = self.embedding_dropout(embeddings) 75 | else: 76 | embeddings = self.embedding_dropout(embeddings) 77 | 78 | return embeddings 79 | 80 | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): 81 | """For easy load.""" 82 | 83 | state_dict_ = {} 84 | state_dict_[self._word_embeddings_key] = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars) 85 | state_dict_[self._position_embeddings_key] = self.position_embeddings.state_dict( 86 | prefix=prefix, keep_vars=keep_vars 87 | ) 88 | 89 | return state_dict_ 90 | 91 | def load_state_dict(self, state_dict, strict=True): 92 | """Customized load.""" 93 | 94 | # Word embedding. 95 | if self._word_embeddings_key in state_dict: 96 | state_dict_ = state_dict[self._word_embeddings_key] 97 | else: 98 | # for backward compatibility. 99 | state_dict_ = {} 100 | for key in state_dict.keys(): 101 | if 'word_embeddings' in key: 102 | state_dict_[key.split('word_embeddings.')[1]] = state_dict[key] 103 | self.word_embeddings.load_state_dict(state_dict_, strict=strict) 104 | 105 | # Position embedding. 106 | if self._position_embeddings_key in state_dict: 107 | state_dict_ = state_dict[self._position_embeddings_key] 108 | else: 109 | # for backward compatibility. 
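            # Older checkpoints store these weights under flat keys such as
            # 'position_embeddings.weight'; strip the prefix so they load into this module.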
110 | state_dict_ = {} 111 | for key in state_dict.keys(): 112 | if 'position_embeddings' in key: 113 | state_dict_[key.split('position_embeddings.')[1]] = state_dict[key] 114 | self.position_embeddings.load_state_dict(state_dict_, strict=strict) 115 | -------------------------------------------------------------------------------- /src/megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 2 6 | PATCH = 0 7 | PRE_RELEASE = '' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 19 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 20 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 21 | __description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models' 22 | __license__ = 'BSD-3' 23 | __keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 24 | -------------------------------------------------------------------------------- /src/megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /src/megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | regex -------------------------------------------------------------------------------- /src/megatron/core/sequence_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_sequence_parallel_cross_entropy 2 | -------------------------------------------------------------------------------- /src/megatron/core/sequence_parallel/cross_entropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from megatron.core.parallel_state import ( 4 | get_sequence_parallel_group, 5 | get_sequence_parallel_rank, 6 | get_sequence_parallel_world_size 7 | ) 8 | 9 | class _VocabSequenceParallelCrossEntropy(torch.autograd.Function): 10 | 11 | @staticmethod 12 | def forward(ctx, vocab_seq_parallel_logits, target, label_smoothing=0.0): 13 | # vocab_seq_parallel_logits: [S/P, B, V] 14 | # target: [S/P, B] 15 | # return: [S, B] 16 | 17 | # Need softmax for backward 18 | softmax = torch.nn.functional.softmax(vocab_seq_parallel_logits, dim=-1) 19 | ctx.vocab_size = vocab_seq_parallel_logits.size(2) 20 | loss = torch.nn.functional.nll_loss(softmax.log().view(-1, ctx.vocab_size), target.view(-1), reduction='none') 21 | 22 | ctx.seqlen = vocab_seq_parallel_logits.size(0) * get_sequence_parallel_world_size() 23 | batch_size = vocab_seq_parallel_logits.size(1) 24 | 25 | loss_all = torch.empty(ctx.seqlen, batch_size, dtype=vocab_seq_parallel_logits.dtype, device=vocab_seq_parallel_logits.device) 26 | 
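        # Each rank computed the per-token loss for its own [S/P, B] slice of the sequence;
        # the all-gather below concatenates those slices along the sequence dimension so that
        # every sequence-parallel rank ends up with the full [S, B] loss tensor.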
torch.distributed.all_gather_into_tensor(loss_all, loss, group=get_sequence_parallel_group()) 27 | 28 | ctx.save_for_backward(softmax, target) 29 | 30 | return loss_all 31 | 32 | @staticmethod 33 | def backward(ctx, grad_output): 34 | softmax, target = ctx.saved_tensors 35 | 36 | step_seqlen = ctx.seqlen // get_sequence_parallel_world_size() 37 | sp_rank = get_sequence_parallel_rank() 38 | grad_output_part = grad_output[step_seqlen*sp_rank:step_seqlen*(sp_rank + 1), :] 39 | 40 | grad_input = softmax 41 | grad_2d = grad_input.view(-1, ctx.vocab_size) 42 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 43 | device=grad_2d.device) 44 | 45 | grad_2d[arange_1d, target.view(-1)] -= 1 46 | grad_input.mul_(grad_output_part.unsqueeze(dim=-1)) 47 | 48 | return grad_input, None, None 49 | 50 | 51 | def vocab_sequence_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0): 52 | return _VocabSequenceParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing) 53 | -------------------------------------------------------------------------------- /src/megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | 4 | from .layers import ( 5 | ColumnParallelLinear, 6 | RowParallelLinear, 7 | VocabParallelEmbedding, 8 | set_tensor_model_parallel_attributes, 9 | set_defaults_if_not_set_tensor_model_parallel_attributes, 10 | copy_tensor_model_parallel_attributes, 11 | param_is_not_tensor_parallel_duplicate, 12 | linear_with_grad_accumulation_and_async_allreduce 13 | 14 | ) 15 | 16 | from .mappings import ( 17 | copy_to_tensor_model_parallel_region, 18 | gather_from_tensor_model_parallel_region, 19 | gather_from_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | scatter_to_sequence_parallel_region, 22 | ) 23 | 24 | from .random import ( 25 | checkpoint, 26 | get_cuda_rng_tracker, 27 | model_parallel_cuda_manual_seed, 28 | init_checkpointed_activations_memory_buffer, 29 | reset_checkpointed_activations_memory_buffer, 30 | ) 31 | 32 | from .utils import ( 33 | split_tensor_along_last_dim, 34 | split_tensor_into_1d_equal_chunks, 35 | gather_split_1d_tensor, 36 | ) 37 | 38 | __all__ = [ 39 | # cross_entropy.py 40 | "vocab_parallel_cross_entropy", 41 | # data.py 42 | "broadcast_data", 43 | #layers.py 44 | "ColumnParallelLinear", 45 | "RowParallelLinear", 46 | "VocabParallelEmbedding", 47 | "set_tensor_model_parallel_attributes", 48 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 49 | "copy_tensor_model_parallel_attributes", 50 | "param_is_not_tensor_parallel_duplicate", 51 | "linear_with_grad_accumulation_and_async_allreduce", 52 | # mappings.py 53 | "copy_to_tensor_model_parallel_region", 54 | "gather_from_tensor_model_parallel_region", 55 | "gather_from_sequence_parallel_region", 56 | # "reduce_from_tensor_model_parallel_region", 57 | "scatter_to_tensor_model_parallel_region", 58 | "scatter_to_sequence_parallel_region", 59 | # random.py 60 | "checkpoint", 61 | "get_cuda_rng_tracker", 62 | "model_parallel_cuda_manual_seed", 63 | "init_checkpointed_activations_memory_buffer", 64 | "reset_checkpointed_activations_memory_buffer", 65 | # utils.py 66 | "split_tensor_along_last_dim", 67 | "split_tensor_into_1d_equal_chunks", 68 | "gather_split_1d_tensor", 69 | ] 70 | -------------------------------------------------------------------------------- 
/src/megatron/core/tensor_parallel/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.parallel_state import ( 6 | get_tensor_model_parallel_group, 7 | get_tensor_model_parallel_rank, 8 | get_tensor_model_parallel_src_rank, 9 | get_sequence_parallel_group, 10 | get_sequence_parallel_world_size, 11 | get_sequence_parallel_rank, 12 | get_sequence_parallel_src_rank, 13 | ) 14 | from deepspeed.accelerator import get_accelerator 15 | 16 | _MAX_DATA_DIM = 5 17 | 18 | 19 | def _check_data_types(keys, data, target_dtype): 20 | """Check that all the keys have the same target data type.""" 21 | for key in keys: 22 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 23 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 24 | 25 | 26 | def _build_key_size_numel_dictionaries(keys, data, group=None, rank=-1, src_rank=-1): 27 | if group is None: 28 | group = get_tensor_model_parallel_group() 29 | if src_rank < 0: 30 | src_rank = get_tensor_model_parallel_src_rank() 31 | if rank < 0: 32 | rank = get_tensor_model_parallel_rank() 33 | 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if rank == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = get_accelerator().LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, src_rank, group=group) 51 | 52 | # Move back to cpu and unpack. 53 | sizes_cpu = sizes_cuda.cpu() 54 | key_size = {} 55 | key_numel = {} 56 | total_numel = 0 57 | offset = 0 58 | for key in keys: 59 | i = 0 60 | size = [] 61 | numel = 1 62 | while sizes_cpu[offset + i] > 0: 63 | this_size = sizes_cpu[offset + i] 64 | size.append(this_size) 65 | numel *= this_size 66 | i += 1 67 | key_size[key] = size 68 | key_numel[key] = numel 69 | total_numel += numel 70 | offset += max_dim 71 | 72 | return key_size, key_numel, total_numel 73 | 74 | 75 | def broadcast_data(keys, data, datatype): 76 | """Broadcast data from rank zero of each model parallel group to the 77 | members of the same model parallel group. 78 | 79 | Arguments: 80 | keys: list of keys in the data disctionary to be broadcasted 81 | data: data dictionary of string keys and cpu tensor values. 82 | datatype: torch data type of all tensors in data associated 83 | with keys. 84 | """ 85 | # Build (key, size) and (key, number of elements) dictionaries along 86 | # with the total number of elements on all ranks. 87 | if get_sequence_parallel_world_size() > 1: 88 | rank = get_sequence_parallel_rank() 89 | src_rank = get_sequence_parallel_src_rank() 90 | group = get_sequence_parallel_group() 91 | else: 92 | rank = get_tensor_model_parallel_rank() 93 | src_rank = get_tensor_model_parallel_src_rank() 94 | group = get_tensor_model_parallel_group() 95 | 96 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries( 97 | keys, data, group=group, rank=rank, src_rank=src_rank) 98 | 99 | # Pack on rank zero. 100 | if rank == 0: 101 | # Check that all keys have the same data type. 
102 | _check_data_types(keys, data, datatype) 103 | # Flatten the data associated with the keys 104 | flatten_data = torch.cat( 105 | [data[key].contiguous().view(-1) for key in keys], dim=0).to(get_accelerator().device_name()) 106 | else: 107 | flatten_data = torch.empty(total_numel, 108 | device=get_accelerator().current_device_name(), 109 | dtype=datatype) 110 | 111 | # Broadcast 112 | torch.distributed.broadcast(flatten_data, src_rank, group=group) 113 | 114 | # Unpack 115 | output = {} 116 | offset = 0 117 | for key in keys: 118 | size = key_size[key] 119 | numel = key_numel[key] 120 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 121 | offset += numel 122 | 123 | return output 124 | -------------------------------------------------------------------------------- /src/megatron/core/tensor_parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from typing import List, Sequence 5 | 6 | from megatron.core.utils import divide 7 | from megatron.core import parallel_state 8 | 9 | def split_tensor_along_last_dim( 10 | tensor: torch.Tensor, 11 | num_partitions: int, 12 | contiguous_split_chunks: bool = False, 13 | ) -> List[torch.Tensor]: 14 | """ Split a tensor along its last dimension. 15 | 16 | Arguments: 17 | tensor: input tensor. 18 | num_partitions: number of partitions to split the tensor 19 | contiguous_split_chunks: If True, make each chunk contiguous 20 | in memory. 21 | 22 | Returns: 23 | A list of Tensors 24 | """ 25 | # Get the size and dimension. 26 | last_dim = tensor.dim() - 1 27 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 28 | # Split. 29 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 30 | # Note: torch.split does not create contiguous tensors by default. 31 | if contiguous_split_chunks: 32 | return tuple(chunk.contiguous() for chunk in tensor_list) 33 | 34 | return tensor_list 35 | 36 | def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): 37 | """ Break a tensor into equal 1D chunks across tensor parallel ranks. 38 | 39 | Returns a Tensor or View with this rank's portion of the data. 40 | 41 | Arguments: 42 | tensor: The tensor to split 43 | 44 | Keyword Arguments: 45 | new_buffer (bool): If True, returns a new Tensor. 46 | If False, returns a view into the existing Tensor. 47 | Default is False 48 | 49 | """ 50 | partition_size = torch.numel(tensor) // \ 51 | parallel_state.get_tensor_model_parallel_world_size() 52 | start_index = partition_size * parallel_state.get_tensor_model_parallel_rank() 53 | end_index = start_index + partition_size 54 | if new_buffer: 55 | data = torch.empty(partition_size, dtype=tensor.dtype, 56 | device=torch.cuda.current_device(), 57 | requires_grad=False) 58 | data.copy_(tensor.view(-1)[start_index:end_index]) 59 | else: 60 | data = tensor.view(-1)[start_index:end_index] 61 | return data 62 | 63 | 64 | def gather_split_1d_tensor(tensor): 65 | """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor 66 | model parallel ranks. 67 | 68 | Returns a new Tensor with the gathered data. 69 | 70 | Arguments: 71 | tensor: A Tensor or view of this rank's portion of the data. 
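    Example (sketch): with a tensor-model-parallel world size of 4 and a local chunk of
    1000 elements, the returned tensor holds 4000 elements, ordered by tensor-parallel rank.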
72 | """ 73 | numel_gathered = torch.numel(tensor) * \ 74 | parallel_state.get_tensor_model_parallel_world_size() 75 | gathered = torch.empty(numel_gathered, dtype=tensor.dtype, 76 | device=torch.cuda.current_device(), 77 | requires_grad=False) 78 | # TODO: This API is experimental in pytorch (as of Feb 2022) and 79 | # this might break in future pytorch releases. We chose this API 80 | # as opposed to torch.distributed.all_gather for efficiency reasons. 81 | # This API calls directly NCCL all-gather versus the former does 82 | # internal copies and can potentially cause slow down. 83 | torch.distributed._all_gather_base(gathered, tensor, 84 | group=parallel_state.get_tensor_model_parallel_group()) 85 | return gathered 86 | 87 | 88 | class VocabUtility: 89 | """ Split the vocabulary into `world_size` chunks and return the first 90 | and last index of the vocabulary belonging to the `rank` 91 | partition: Note that indices in [fist, last) 92 | 93 | """ 94 | 95 | @staticmethod 96 | def vocab_range_from_per_partition_vocab_size( 97 | per_partition_vocab_size: int, rank, world_size: int 98 | ) -> Sequence[int]: 99 | index_f = rank * per_partition_vocab_size 100 | index_l = index_f + per_partition_vocab_size 101 | return index_f, index_l 102 | 103 | @staticmethod 104 | def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]: 105 | per_partition_vocab_size = divide(global_vocab_size, world_size) 106 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 107 | per_partition_vocab_size, rank, world_size 108 | ) 109 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .transformer_config import TransformerConfig 4 | from .core_attention import CoreAttention 5 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/custom_layers/transformer_engine.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformer_engine as te 3 | from typing import Callable 4 | 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | from megatron.core.transformer.enums import AttnMaskType 7 | from megatron.core.parallel_state import get_tensor_model_parallel_group 8 | from megatron.core.tensor_parallel import get_cuda_rng_tracker 9 | 10 | class TELayerNorm(te.pytorch.module.LayerNorm): 11 | """ 12 | Wrapper for the Transformer-Engine's `LayerNorm`. 13 | """ 14 | def __init__(self, 15 | hidden_size: int, 16 | eps: float = 1e-5, 17 | sequence_parallel: bool = False, 18 | **kwargs): 19 | super().__init__( 20 | hidden_size=hidden_size, 21 | eps=eps, 22 | sequence_parallel=sequence_parallel 23 | ) 24 | 25 | class TELinear(te.pytorch.module.Linear): 26 | """ 27 | Wrapper for the Transformer-Engine's `Linear` layer. 28 | 29 | Note that if Megatron's parallel_state has not been initialized 30 | yet, the tp_group passed to TE will be None and must be set later 31 | via set_tensor_parallel_group(). 
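    forward() always returns a pair: the output tensor and either the bias (when both
    bias and skip_bias_add are enabled) or None.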
32 | """ 33 | def __init__(self, 34 | input_size: int, 35 | output_size: int, 36 | config: TransformerConfig, 37 | parallel_mode: str, 38 | init_method: Callable, *, 39 | bias: bool = True, 40 | skip_bias_add: bool = False, 41 | **kwargs): 42 | self.config = config 43 | 44 | # TE returns a zero length Tensor when bias=False and 45 | # return_bias=True, but we prefer None. So in that case we 46 | # tell TE to not return the bias, and return None 47 | # ourselves. This way our forward always returns two values 48 | # and we don't have to deal with the zero length Tensor. 49 | self.te_return_bias = skip_bias_add and bias 50 | 51 | super().__init__( 52 | in_features=input_size, 53 | out_features=output_size, 54 | sequence_parallel=self.config.sequence_parallel, 55 | fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, 56 | tp_group=get_tensor_model_parallel_group(check_initialized=False), 57 | tp_size=self.config.tensor_model_parallel_size, 58 | get_rng_state_tracker=get_cuda_rng_tracker, 59 | init_method=init_method, 60 | params_dtype=self.config.params_dtype, 61 | parallel_mode=parallel_mode, 62 | bias=bias, 63 | return_bias=self.te_return_bias, 64 | **kwargs 65 | ) 66 | 67 | def forward(self, x): 68 | out = super().forward(x) 69 | 70 | # TE only returns a tuple when return_bias is True, otherwise 71 | # it returns a single Tensor, we always want to return two 72 | # values regardless of the arguments. 73 | if self.te_return_bias: 74 | return out 75 | return out, None 76 | 77 | class TEColumnParallelLinear(TELinear): 78 | """ 79 | Wrapper for the Transformer-Engine's `Linear` layer but specialized similar 80 | to megatron's `ColumnParallelLinear` layer. 81 | """ 82 | def __init__(self, 83 | input_size: int, 84 | output_size: int, 85 | config: TransformerConfig, 86 | **kwargs): 87 | self.config = config 88 | super().__init__( 89 | input_size=input_size, 90 | output_size=output_size, 91 | config=self.config, 92 | parallel_mode="column", 93 | **kwargs 94 | ) 95 | 96 | class TERowParallelLinear(TELinear): 97 | """ 98 | Wrapper for the Transformer-Engine's `Linear` layer but specialized similar 99 | to megatron's `RowParallelLinear` layer. 100 | """ 101 | def __init__(self, 102 | input_size: int, 103 | output_size: int, 104 | config: TransformerConfig, 105 | **kwargs): 106 | self.config = config 107 | super().__init__( 108 | input_size=input_size, 109 | output_size=output_size, 110 | config=self.config, 111 | parallel_mode="row", 112 | **kwargs 113 | ) 114 | 115 | class TECoreAttention(te.pytorch.transformer.DotProductAttention): 116 | """ 117 | Wrapper for the Transformer-Engine's `DotProductAttention` layer that also 118 | has "flash attention" enabled. 119 | 120 | Note that if Megatron's parallel_state has not been initialized 121 | yet, the tp_group passed to TE will be None and must be set later 122 | via set_tensor_parallel_group(). 
123 | """ 124 | def __init__(self, 125 | config: TransformerConfig, 126 | layer_number: int = 1, 127 | attn_mask_type: AttnMaskType = AttnMaskType.padding, 128 | **kwargs): 129 | self.config = config 130 | super().__init__( 131 | num_attention_heads=self.config.num_attention_heads, 132 | kv_channels=self.config.kv_channels, 133 | attention_dropout=self.config.attention_dropout, 134 | layer_number=layer_number, 135 | attn_mask_type=attn_mask_type.name, 136 | sequence_parallel=self.config.sequence_parallel, 137 | tp_size=self.config.tensor_model_parallel_size, 138 | get_rng_state_tracker=get_cuda_rng_tracker, 139 | tp_group=get_tensor_model_parallel_group(check_initialized=False), 140 | **kwargs 141 | ) 142 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from megatron.core import tensor_parallel 7 | from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl 8 | from megatron.core.transformer.module import MegatronModule 9 | from megatron.core.transformer.transformer_config import TransformerConfig 10 | from megatron.core.transformer.custom_layers.transformer_engine import \ 11 | TERowParallelLinear, TEColumnParallelLinear 12 | 13 | class MLP(MegatronModule): 14 | """ 15 | MLP will take the input with h hidden state, project it to 4*h 16 | hidden dimension, perform nonlinear transformation, and project the 17 | state back into h hidden dimension. 18 | 19 | 20 | Returns an output and a bias to be added to the output. 21 | If config.add_bias_linear is False, the bias returned is None. 
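    Gated example (sketch): with gated_linear_unit=True, linear_fc1 projects h to
    2 * ffn_hidden_size; glu() applies activation_func to one half and multiplies it with
    the other half, and linear_fc2 projects the result back to h.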
22 | 23 | We use the following notation: 24 | h: hidden size 25 | p: number of tensor model parallel partitions 26 | b: batch size 27 | s: sequence length 28 | """ 29 | 30 | def __init__(self, config: TransformerConfig): 31 | super().__init__(config=config) 32 | 33 | self.config: TransformerConfig = config 34 | 35 | # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf 36 | ffn_hidden_size = self.config.ffn_hidden_size 37 | if self.config.gated_linear_unit: 38 | ffn_hidden_size *= 2 39 | 40 | self.linear_fc1 = TEColumnParallelLinear( 41 | self.config.hidden_size, 42 | ffn_hidden_size, 43 | config=self.config, 44 | init_method=self.config.init_method, 45 | bias=self.config.add_bias_linear, 46 | skip_bias_add=True, 47 | ) 48 | 49 | if self.config.gated_linear_unit: 50 | def glu(x): 51 | x = torch.chunk(x, 2, dim=-1) 52 | return self.config.activation_func(x[0]) * x[1] 53 | self.activation_func = glu 54 | else: 55 | self.activation_func = self.config.activation_func 56 | 57 | self.linear_fc2 = TERowParallelLinear( 58 | self.config.ffn_hidden_size, 59 | self.config.hidden_size, 60 | config=self.config, 61 | init_method=self.config.output_layer_init_method, 62 | bias=self.config.add_bias_linear, 63 | skip_bias_add=True, 64 | ) 65 | 66 | def forward(self, hidden_states): 67 | 68 | # [s, b, 4 * h/p] 69 | intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) 70 | 71 | if self.config.bias_gelu_fusion: 72 | assert self.config.add_bias_linear is True 73 | assert self.activation_func == F.gelu 74 | intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) 75 | else: 76 | if bias_parallel is not None: 77 | intermediate_parallel = intermediate_parallel + bias_parallel 78 | intermediate_parallel = self.activation_func(intermediate_parallel) 79 | 80 | # [s, b, h] 81 | output, output_bias = self.linear_fc2(intermediate_parallel) 82 | return output, output_bias 83 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Megatron Module""" 4 | 5 | import torch 6 | from torch.autograd import Variable 7 | from torch.nn.parameter import Parameter 8 | 9 | from megatron.core import parallel_state, tensor_parallel 10 | from megatron.core.transformer.transformer_config import TransformerConfig 11 | 12 | 13 | _FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) 14 | _HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) 15 | _BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) 16 | 17 | 18 | def param_is_not_shared(param): 19 | return not hasattr(param, 'shared') or not param.shared 20 | 21 | 22 | class MegatronModule(torch.nn.Module): 23 | """Megatron specific extensions of torch Module with support 24 | for pipelining.""" 25 | 26 | # def __init__(self, config: TransformerConfig, share_word_embeddings=True): 27 | def __init__(self, config: TransformerConfig): 28 | super().__init__() 29 | self.config = config 30 | 31 | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): 32 | """Use this function to override the state dict for 33 | saving checkpoints.""" 34 | return self.state_dict(prefix=prefix, keep_vars=keep_vars) 35 | 36 | 37 | def conversion_helper(val, conversion): 38 | """Apply conversion to val. 
Recursively apply conversion if `val` 39 | #is a nested tuple/list structure.""" 40 | if not isinstance(val, (tuple, list)): 41 | return conversion(val) 42 | rtn = [conversion_helper(v, conversion) for v in val] 43 | if isinstance(val, tuple): 44 | rtn = tuple(rtn) 45 | return rtn 46 | 47 | 48 | def fp32_to_float16(val, float16_convertor): 49 | """Convert fp32 `val` to fp16/bf16""" 50 | 51 | def half_conversion(val): 52 | val_typecheck = val 53 | if isinstance(val_typecheck, (Parameter, Variable)): 54 | val_typecheck = val.data 55 | if isinstance(val_typecheck, _FLOAT_TYPES): 56 | val = float16_convertor(val) 57 | return val 58 | 59 | return conversion_helper(val, half_conversion) 60 | 61 | 62 | def float16_to_fp32(val): 63 | """Convert fp16/bf16 `val` to fp32""" 64 | 65 | def float_conversion(val): 66 | val_typecheck = val 67 | if isinstance(val_typecheck, (Parameter, Variable)): 68 | val_typecheck = val.data 69 | if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): 70 | val = val.float() 71 | return val 72 | 73 | return conversion_helper(val, float_conversion) 74 | 75 | 76 | class Float16Module(MegatronModule): 77 | def __init__(self, config: TransformerConfig, module: torch.nn.Module): 78 | super(Float16Module, self).__init__(config) 79 | self.config = config 80 | self.fp16 = config.fp16 81 | self.bf16 = config.bf16 82 | 83 | if self.fp16: 84 | self.add_module('module', module.half()) 85 | 86 | def float16_convertor(val): 87 | return val.half() 88 | 89 | elif self.bf16: 90 | self.add_module('module', module.bfloat16()) 91 | 92 | def float16_convertor(val): 93 | return val.bfloat16() 94 | 95 | else: 96 | raise Exception('Either config.fp16 or config.bf16 should be True.') 97 | 98 | self.float16_convertor = float16_convertor 99 | 100 | def set_input_tensor(self, input_tensor): 101 | return self.module.set_input_tensor(input_tensor) 102 | 103 | def forward(self, *inputs, **kwargs): 104 | if parallel_state.is_pipeline_first_stage(): 105 | inputs = fp32_to_float16(inputs, self.float16_convertor) 106 | outputs = self.module(*inputs, **kwargs) 107 | if parallel_state.is_pipeline_last_stage(): 108 | outputs = float16_to_fp32(outputs) 109 | return outputs 110 | 111 | def state_dict(self, destination=None, prefix='', keep_vars=False): 112 | return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) 113 | 114 | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): 115 | return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) 116 | 117 | def load_state_dict(self, state_dict, strict=True): 118 | self.module.load_state_dict(state_dict, strict=strict) 119 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/transformer_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | from megatron.core.transformer.module import MegatronModule 6 | from megatron.core.transformer.transformer_config import TransformerConfig 7 | from megatron.core.transformer.enums import AttnType, AttnMaskType 8 | from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add 9 | from megatron.core.transformer.attention import SelfAttention 10 | from megatron.core.transformer.mlp import MLP 11 | from megatron.core.utils import make_viewless_tensor 12 | from megatron.core.transformer.custom_layers.transformer_engine import TELayerNorm 13 | 14 | class TransformerLayer(MegatronModule): 15 | """A single transformer layer. 16 | 17 | Transformer layer takes input with size [s, b, h] and returns an 18 | output of the same size. 19 | """ 20 | 21 | def __init__( 22 | self, config: TransformerConfig, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding, 23 | ): 24 | super().__init__(config=config) 25 | self.config: TransformerConfig = config 26 | 27 | self.layer_number = layer_number 28 | self.self_attn_mask_type = self_attn_mask_type 29 | 30 | # Layernorm on the input data. 31 | # TODO: add pytorch only layernorm 32 | self.input_layernorm = TELayerNorm( 33 | hidden_size=self.config.hidden_size, 34 | eps=self.config.layernorm_epsilon, 35 | persist_layer_norm=self.config.persist_layer_norm, 36 | sequence_parallel=self.config.sequence_parallel, 37 | zero_centered_gamma=self.config.layernorm_zero_centered_gamma, 38 | ) 39 | 40 | # Self attention. 41 | self.self_attention = SelfAttention( 42 | config=self.config, 43 | layer_number=layer_number, 44 | attn_mask_type=self_attn_mask_type, 45 | ) 46 | 47 | # Layernorm on the attention output 48 | self.post_self_attn_layernorm = TELayerNorm( 49 | hidden_size=self.config.hidden_size, 50 | eps=self.config.layernorm_epsilon, 51 | persist_layer_norm=self.config.persist_layer_norm, 52 | sequence_parallel=self.config.sequence_parallel, 53 | zero_centered_gamma=self.config.layernorm_zero_centered_gamma, 54 | ) 55 | 56 | # MLP 57 | self.mlp = MLP(config=self.config) 58 | 59 | # @jcasper how should we handle nvfuser? 60 | # Set bias+dropout+add fusion grad_enable execution handler. 61 | # TORCH_MAJOR = int(torch.__version__.split('.')[0]) 62 | # TORCH_MINOR = int(torch.__version__.split('.')[1]) 63 | # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) 64 | # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad 65 | self.bias_dropout_add_exec_handler = torch.enable_grad 66 | 67 | self.bias_dropout_add_func = get_bias_dropout_add( 68 | self.training, 69 | self.config.bias_dropout_fusion 70 | ) 71 | 72 | # TODO: decide how to do inference_params 73 | def forward( 74 | self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None 75 | ): 76 | # hidden_states: [s, b, h] 77 | 78 | # Layer norm at the beginning of the transformer layer. 79 | layernorm_output = self.input_layernorm(hidden_states) 80 | # Self attention. 81 | attention_output_with_bias = self.self_attention( 82 | layernorm_output, attention_mask, inference_params=inference_params 83 | ) 84 | 85 | # Residual connection. 
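        # With apply_residual_connection_post_layernorm the residual branch starts from the
        # layernorm output; otherwise it starts from the raw block input (the usual
        # pre-LayerNorm residual).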
86 | if self.config.apply_residual_connection_post_layernorm: 87 | residual = layernorm_output 88 | else: 89 | residual = hidden_states 90 | 91 | # bias_dropout_add fusion returning fp32 instead of bf16 92 | with self.bias_dropout_add_exec_handler(): 93 | layernorm_input = self.bias_dropout_add_func( 94 | attention_output_with_bias, residual, self.config.hidden_dropout 95 | ) 96 | 97 | # Layer norm post the self attention. 98 | layernorm_output = self.post_self_attn_layernorm(layernorm_input) 99 | 100 | # MLP. 101 | mlp_output_with_bias = self.mlp(layernorm_output) 102 | 103 | # Second residual connection. 104 | if self.config.apply_residual_connection_post_layernorm: 105 | residual = layernorm_output 106 | else: 107 | residual = layernorm_input 108 | 109 | with self.bias_dropout_add_exec_handler(): 110 | output = self.bias_dropout_add_func( 111 | mlp_output_with_bias, residual, self.config.hidden_dropout 112 | ) 113 | 114 | # Jit compiled function creates 'view' tensor. This tensor 115 | # potentially gets saved in the MPU checkpoint function context, 116 | # which rejects view tensors. While making a viewless tensor here 117 | # won't result in memory savings (like the data loader, or 118 | # p2p_communication), it serves to document the origin of this 119 | # 'view' tensor. 120 | output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) 121 | 122 | return output 123 | -------------------------------------------------------------------------------- /src/megatron/core/transformer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for transformer layers.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args 8 | 9 | from deepspeed.runtime.zero import GatheredParameters 10 | 11 | def attention_mask_func(attention_scores, attention_mask): 12 | attention_scores.masked_fill_(attention_mask, -10000.0) 13 | return attention_scores 14 | 15 | 16 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False): 17 | """Simple linear layer with weight initialization.""" 18 | layer = torch.nn.Linear(rows, columns) 19 | if get_args().perform_initialization: 20 | with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init): 21 | init_method(layer.weight) 22 | with torch.no_grad(): 23 | with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init): 24 | layer.bias.zero_() 25 | return layer 26 | 27 | 28 | @torch.jit.script 29 | def gelu_impl(x): 30 | """OpenAI's gelu implementation.""" 31 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 32 | 33 | 34 | def openai_gelu(x): 35 | return gelu_impl(x) 36 | 37 | 38 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 39 | @torch.jit.script 40 | def erf_gelu(x): 41 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) 42 | -------------------------------------------------------------------------------- /src/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) 
$(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /src/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /src/megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Blendable dataset.""" 4 | 5 | import hashlib 6 | import os 7 | import time 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from megatron import print_rank_0 13 | from megatron.core import mpu 14 | 15 | class BlendableDataset(torch.utils.data.Dataset): 16 | 17 | 18 | def __init__(self, datasets, weights, size, *, 19 | data_cache_path=None): 20 | 21 | self.datasets = datasets 22 | num_datasets = len(datasets) 23 | assert num_datasets == len(weights) 24 | 25 | self.size = size 26 | 27 | # Normalize weights. 28 | weights = np.array(weights, dtype=np.float64) 29 | sum_weights = np.sum(weights) 30 | assert sum_weights > 0.0 31 | weights /= sum_weights 32 | 33 | # Build indicies. 34 | def _build_indices(): 35 | start_time = time.time() 36 | assert num_datasets < 255 37 | dataset_index = np.zeros(self.size, dtype=np.uint8) 38 | dataset_sample_index = np.zeros(self.size, dtype=np.int64) 39 | 40 | from megatron.data import helpers 41 | helpers.build_blending_indices(dataset_index, dataset_sample_index, 42 | weights, num_datasets, self.size, 43 | torch.distributed.get_rank() == 0) 44 | print_rank_0('> elapsed time for building blendable dataset indices: ' 45 | '{:.2f} (sec)'.format(time.time() - start_time)) 46 | return dataset_index, dataset_sample_index 47 | 48 | desc = "Blendable dataset\n\n" 49 | desc += "Datasets:\n" 50 | for dataset in datasets: 51 | desc += dataset.desc + "\n\n" 52 | desc += f"Weights: {weights}\n" 53 | desc += f"Size: {size}\n" 54 | self.desc = desc 55 | 56 | if data_cache_path: 57 | desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() 58 | desc_path = os.path.join(data_cache_path, desc_hash + ".dsc") 59 | index_path = os.path.join(data_cache_path, desc_hash + "_index.npy") 60 | sample_index_path = os.path.join(data_cache_path, desc_hash + "_sample_index.npy") 61 | cache_hit = os.path.isfile(index_path) and os.path.isfile(sample_index_path) 62 | cache_success = True 63 | if torch.distributed.get_rank() == 0 and not cache_hit: 64 | print(' > WARNING: could not find index map files for blendable' 65 | ' dataset, building indices on rank 0 ...', flush=True) 66 | dataset_index, dataset_sample_index = _build_indices() 67 | try: 68 | os.makedirs(os.path.dirname(index_path), exist_ok=True) 69 | with open(desc_path, 'wt') as fd: 70 | fd.write(desc) 71 | np.save(index_path, dataset_index, allow_pickle=True) 72 | np.save(sample_index_path, dataset_sample_index, 73 | allow_pickle=True) 74 | except OSError: 75 | print(f'There was an error trying to create the data cache directory ({data_cache_path})') 76 | print('or a file in it. This is set with the --data-cache-path argument. 
Please') 77 | print('ensure you have write access to this directory or specify one that you do have') 78 | print('write access to.') 79 | cache_success = False 80 | 81 | 82 | counts = torch.cuda.LongTensor([cache_success]) 83 | torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) 84 | torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) 85 | if counts[0].item() != ( 86 | torch.distributed.get_world_size() // 87 | torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // 88 | torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): 89 | print_rank_0("Data index creation unsuccessful, exiting.") 90 | exit() 91 | 92 | # Load on all ranks. 93 | print_rank_0(f'> loading blendable dataset index: {index_path}') 94 | self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r') 95 | assert self.dataset_index.size == self.size 96 | 97 | print_rank_0(f'> loading blendable dataset sample index: {sample_index_path}') 98 | self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r') 99 | assert self.dataset_sample_index.size == self.size 100 | else: 101 | self.dataset_index, self.dataset_sample_index = _build_indices() 102 | 103 | 104 | # Check size 105 | _ = self.__getitem__(self.size - 1) 106 | try: 107 | _ = self.__getitem__(self.size) 108 | raise RuntimeError('BlendedDataset size is improperly bounded') 109 | except IndexError: 110 | pass 111 | print_rank_0('> size of blendable dataset: ' 112 | '{} samples'.format(self.size)) 113 | 114 | 115 | def __len__(self): 116 | return self.size 117 | 118 | 119 | def __getitem__(self, idx): 120 | dataset_idx = self.dataset_index[idx] 121 | sample_idx = self.dataset_sample_index[idx] 122 | return { 123 | "dataset_idx" : dataset_idx, 124 | **self.datasets[dataset_idx][sample_idx], 125 | } 126 | -------------------------------------------------------------------------------- /src/megatron/data/test/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # This file isn't really a formal automated test, it's just a place to 2 | # put some code used during development and manual testing of 3 | # indexed_dataset. 
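# (Added note.) Example invocation of this script; the data prefix and tokenizer files
# below are placeholders for illustration, not artifacts shipped with the repository:
#
#     python test_indexed_dataset.py \
#         --data /path/to/my-gpt2_text_document \
#         --dataset-impl mmap \
#         --count 10 \
#         --tokenizer-type GPT2BPETokenizer \
#         --vocab-file /path/to/gpt2-vocab.json \
#         --merge-file /path/to/gpt2-merges.txt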
4 | 5 | from megatron.data import indexed_dataset 6 | from megatron.tokenizer import build_tokenizer 7 | import argparse 8 | import os 9 | import sys 10 | 11 | import torch 12 | 13 | script_dir = os.path.dirname(os.path.realpath(__file__)) 14 | sys.path.append(os.path.join(script_dir, "../../../")) 15 | 16 | 17 | def test_indexed_dataset(args): 18 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 19 | tokenizer = build_tokenizer(args) 20 | print(len(ds.doc_idx)) 21 | print(len(ds)) 22 | print(ds.doc_idx[-1]) 23 | if ds.supports_prefetch: 24 | # just prefetch the whole thing in test (so assume it is small) 25 | ds.prefetch(range(len(ds))) 26 | if args.count > len(ds.doc_idx) - 1: 27 | args.count = len(ds.doc_idx) - 1 28 | 29 | for i in range(args.count): 30 | start = ds.doc_idx[i] 31 | end = ds.doc_idx[i + 1] 32 | ids = ds[start:end] 33 | print(f"Document {i}:") 34 | print("--------------") 35 | for s in ids: 36 | assert len(s) > 0 37 | l = s.data.tolist() 38 | text = tokenizer.detokenize(l) 39 | print(text) 40 | print("---") 41 | 42 | 43 | def test_indexed_dataset_get(args): 44 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 45 | tokenizer = build_tokenizer(args) 46 | size = ds.sizes[0] 47 | print(f"size: {size}") 48 | full = ds.get(0) 49 | print(full) 50 | # print(tokenizer.detokenize(full.data.tolist())) 51 | print("---") 52 | end = ds.get(0, offset=size - 10) 53 | print(end) 54 | # print(tokenizer.detokenize(end.data.tolist())) 55 | 56 | start = ds.get(0, length=10) 57 | print(start) 58 | # print(tokenizer.detokenize(start.data.tolist())) 59 | 60 | part = ds.get(0, offset=2, length=8) 61 | print(part) 62 | # print(tokenizer.detokenize(part.data.tolist())) 63 | 64 | # def test_albert_dataset(args): 65 | # # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) 66 | # # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) 67 | # # ds = AlbertDataset(idataset, tokenizer) 68 | # ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, 69 | # args.epochs, args.max_num_samples, 70 | # args.masked_lm_prob, args.seq_length, 71 | # args.short_seq_prob, args.seed) 72 | # truncated = 0 73 | # total = 0 74 | # for i, s in enumerate(ds): 75 | # ids = s['text'] 76 | # tokens = ds.tokenizer.convert_ids_to_tokens(ids) 77 | # print(tokens) 78 | # if i >= args.count-1: 79 | # exit() 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--data', type=str, help='prefix to data files') 85 | parser.add_argument('--dataset-impl', type=str, default='infer', 86 | choices=['lazy', 'cached', 'mmap', 'infer']) 87 | parser.add_argument('--count', type=int, default=10, 88 | help='Number of samples/documents to print') 89 | 90 | group = parser.add_argument_group(title='tokenizer') 91 | group.add_argument('--tokenizer-type', type=str, required=True, 92 | choices=['BertWordPieceLowerCase', 93 | 'GPT2BPETokenizer'], 94 | help='What type of tokenizer to use.') 95 | group.add_argument('--vocab-file', type=str, default=None, 96 | help='Path to the vocab file') 97 | group.add_argument('--merge-file', type=str, default=None, 98 | help='Path to the BPE merge file (if necessary).') 99 | 100 | parser.add_argument('--epochs', type=int, default=5, 101 | help='Number of epochs to plan for') 102 | parser.add_argument('--max-num-samples', type=int, default=None, 103 | help='Maximum number of samples to plan for') 104 | parser.add_argument('--masked-lm-prob', type=float, default=0.15, 105 | help='probability of masking 
tokens') 106 | parser.add_argument('--seq-length', type=int, default=512, 107 | help='maximum sequence length') 108 | parser.add_argument('--short-seq-prob', type=float, default=0.1, 109 | help='probability of creating a short sequence') 110 | parser.add_argument('--seed', type=int, default=1234, 111 | help='random seed') 112 | args = parser.parse_args() 113 | args.rank = 0 114 | args.make_vocab_size_divisible_by = 128 115 | args.tensor_model_parallel_size = 1 116 | 117 | if args.dataset_impl == "infer": 118 | args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) 119 | 120 | # test_albert_dataset(args) 121 | test_indexed_dataset_get(args) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /src/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /src/megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | 
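# (Added note.) Typical use is as a context manager around a training loop, so every rank
# learns via all_gather_item whether any rank received the signal and can stop at a step
# boundary; the loop and checkpoint call below are placeholders, only the handler API comes
# from this file:
#
#     with DistributedSignalHandler(signal.SIGTERM) as handler:
#         for step in range(num_steps):
#             train_step()
#             if any(handler.signals_received()):
#                 save_checkpoint()              # caller-supplied cleanup
#                 break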
-------------------------------------------------------------------------------- /src/megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /src/megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import pathlib 5 | import subprocess 6 | 7 | import torch 8 | from torch.utils import cpp_extension 9 | 10 | # Setting this param to a list has a problem of generating different 11 | # compilation commands (with diferent order of architectures) and 12 | # leading to recompilation of fused kernels. 
Set it to empty string 13 | # to avoid recompilation and assign arch flags explicity in 14 | # extra_cuda_cflags below 15 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 16 | 17 | 18 | def load(args): 19 | 20 | # Check if cuda 11 is installed for compute capability 8.0 21 | cc_flag = [] 22 | if torch.version.hip is None: 23 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( 24 | cpp_extension.CUDA_HOME) 25 | if int(bare_metal_major) >= 11: 26 | cc_flag.append('-gencode') 27 | cc_flag.append('arch=compute_80,code=sm_80') 28 | if int(bare_metal_minor) >= 1: 29 | cc_flag.append('-gencode') 30 | cc_flag.append('arch=compute_86,code=sm_86') 31 | if int(bare_metal_minor) >= 4: 32 | cc_flag.append('-gencode') 33 | cc_flag.append('arch=compute_87,code=sm_87') 34 | if int(bare_metal_minor) >= 8: 35 | cc_flag.append('-gencode') 36 | cc_flag.append('arch=compute_89,code=sm_89') 37 | if int(bare_metal_major) >= 12: 38 | cc_flag.append('-gencode') 39 | cc_flag.append('arch=compute_90,code=sm_90') 40 | 41 | # Build path 42 | srcpath = pathlib.Path(__file__).parent.absolute() 43 | buildpath = srcpath / 'build' 44 | _create_build_dir(buildpath) 45 | 46 | # Helper function to build the kernels. 47 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): 48 | if torch.version.hip is not None: 49 | extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag 50 | else: 51 | extra_cuda_cflags=['-O3', 52 | '-gencode', 'arch=compute_70,code=sm_70', 53 | '--use_fast_math'] + extra_cuda_flags + cc_flag 54 | 55 | return cpp_extension.load( 56 | name=name, 57 | sources=sources, 58 | build_directory=buildpath, 59 | extra_cflags=['-O3',], 60 | extra_cuda_cflags=extra_cuda_cflags, 61 | extra_include_paths=extra_include_paths, 62 | verbose=(args.rank == 0) 63 | ) 64 | 65 | # ============== 66 | # Fused softmax. 67 | # ============== 68 | 69 | if torch.version.hip is not None: 70 | extra_include_paths=[os.path.abspath(srcpath)] 71 | else: 72 | extra_include_paths=[] 73 | 74 | if args.masked_softmax_fusion: 75 | if torch.version.hip is not None: 76 | extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', 77 | '-D__HIP_NO_HALF_CONVERSIONS__=1'] 78 | else: 79 | extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', 80 | '-U__CUDA_NO_HALF_CONVERSIONS__', 81 | '--expt-relaxed-constexpr', 82 | '--expt-extended-lambda'] 83 | 84 | # Upper triangular softmax. 85 | sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', 86 | srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] 87 | scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( 88 | "scaled_upper_triang_masked_softmax_cuda", 89 | sources, extra_cuda_flags, extra_include_paths) 90 | 91 | # Masked softmax. 
92 | sources=[srcpath / 'scaled_masked_softmax.cpp', 93 | srcpath / 'scaled_masked_softmax_cuda.cu'] 94 | scaled_masked_softmax_cuda = _cpp_extention_load_helper( 95 | "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) 96 | 97 | # Softmax 98 | sources=[srcpath / 'scaled_softmax.cpp', 99 | srcpath / 'scaled_softmax_cuda.cu'] 100 | scaled_softmax_cuda = _cpp_extention_load_helper( 101 | "scaled_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) 102 | 103 | 104 | def _get_cuda_bare_metal_version(cuda_dir): 105 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 106 | universal_newlines=True) 107 | output = raw_output.split() 108 | release_idx = output.index("release") + 1 109 | release = output[release_idx].split(".") 110 | bare_metal_major = release[0] 111 | bare_metal_minor = release[1][0] 112 | 113 | return raw_output, bare_metal_major, bare_metal_minor 114 | 115 | 116 | def _create_build_dir(buildpath): 117 | try: 118 | os.mkdir(buildpath) 119 | except OSError: 120 | if not os.path.isdir(buildpath): 121 | print(f"Creation of the build directory {buildpath} failed") 122 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | torch::Tensor const& mask, 14 | float scale_factor); 15 | 16 | torch::Tensor bwd_cuda( 17 | torch::Tensor const& output_grads, 18 | torch::Tensor const& softmax_results, 19 | float scale_factor); 20 | 21 | int get_batch_per_block_cuda( 22 | int query_seq_len, 23 | int key_seq_len, 24 | int batches, 25 | int attn_heads); 26 | 27 | torch::Tensor fwd( 28 | torch::Tensor const& input, 29 | torch::Tensor const& mask, 30 | float scale_factor) { 31 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 32 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 33 | (input.scalar_type() == at::ScalarType::BFloat16), 34 | "Only fp16 and bf16 are supported"); 35 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 36 | 37 | return fwd_cuda(input, mask, scale_factor); 38 | } 39 | 40 | torch::Tensor bwd( 41 | torch::Tensor const& output_grads, 42 | torch::Tensor const& softmax_results, 43 | float scale_factor) { 44 | 45 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 46 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 47 | 48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 49 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 50 | "Only fp16 and bf16 are supported"); 51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | int get_batch_per_block( 59 | int query_seq_len, 60 | int key_seq_len, 61 | int batches, 62 | int attn_heads) { 63 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 64 | } 65 | 66 | } // end namespace scaled_masked_softmax 67 | } // end namespace fused_softmax 68 | } // end namespace multihead_attn 69 | 70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 71 | m.def("forward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 74 | 75 | m.def("backward", 76 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 77 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 78 | 79 | m.def("get_batch_per_block", 80 | &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, 81 | "Return Batch per block size." 82 | ); 83 | } 84 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #ifndef __HIP_PLATFORM_HCC__ 8 | #include 9 | #endif 10 | #include 11 | #include 12 | #include "scaled_masked_softmax.h" 13 | #include "type_shim.h" 14 | 15 | namespace multihead_attn { 16 | namespace fused_softmax { 17 | namespace scaled_masked_softmax { 18 | 19 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 20 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 21 | } 22 | 23 | 24 | torch::Tensor fwd_cuda( 25 | torch::Tensor const& input, 26 | torch::Tensor const& mask, 27 | float scale_factor) 28 | { 29 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 30 | const int batches = input.size(0); 31 | const int pad_batches = mask.size(0); 32 | const int attn_heads = input.size(1); 33 | const int query_seq_len = input.size(2); 34 | const int key_seq_len = input.size(3); 35 | TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); 36 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 37 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 38 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 39 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 40 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 41 | 42 | // Output 43 | auto act_options = input.options().requires_grad(false); 44 | torch::Tensor softmax_results = 45 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 46 | 47 | // Softmax Intermediate Result Ptr 48 | void* input_ptr = static_cast(input.data_ptr()); 49 | void* mask_ptr = static_cast(mask.data_ptr()); 50 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 51 | 52 | DISPATCH_HALF_AND_BFLOAT( 53 | input.scalar_type(), 54 | "dispatch_scaled_masked_softmax_forward", 55 | dispatch_scaled_masked_softmax_forward( 56 | reinterpret_cast(softmax_results_ptr), 57 | reinterpret_cast(input_ptr), 58 | reinterpret_cast(mask_ptr), 59 | scale_factor, 60 | query_seq_len, 61 | key_seq_len, 62 | batches, 63 | attn_heads, 64 | pad_batches); 65 | ); 66 | return softmax_results; 67 | } 68 | 69 | torch::Tensor bwd_cuda( 70 | torch::Tensor const& output_grads_, 71 | torch::Tensor const& softmax_results_, 72 | float scale_factor) { 73 | 74 | auto output_grads = output_grads_.contiguous(); 75 | auto softmax_results = softmax_results_.contiguous(); 76 | 77 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 78 | const int batches = output_grads.size(0); 79 | const int attn_heads = output_grads.size(1); 80 | const int query_seq_len = output_grads.size(2); 81 | const int key_seq_len = output_grads.size(3); 82 | 83 | auto act_options = output_grads.options().requires_grad(false); 84 | torch::Tensor input_grads = 85 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 86 | 87 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 88 | void* input_grads_ptr = static_cast(input_grads.data_ptr()); 89 | 90 | //Softmax Grad 91 | DISPATCH_HALF_AND_BFLOAT( 92 | output_grads_.scalar_type(), 93 | "dispatch_scaled_masked_softmax_backward", 94 | dispatch_scaled_masked_softmax_backward( 95 | reinterpret_cast(input_grads_ptr), 96 | reinterpret_cast(output_grads_ptr), 97 | reinterpret_cast(softmax_results.data_ptr()), 98 | scale_factor, 99 | query_seq_len, 100 | key_seq_len, 101 | batches, 102 | attn_heads); 103 | ); 104 | 105 | return input_grads; 106 | } 107 | } 108 | } 109 | } 110 | 
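# (Added note.) Python-side sketch of exercising the extension built from the two files above;
# in this repository the call is normally wrapped by megatron/model/fused_softmax.py, so the
# direct usage below is illustrative only, and the shapes are assumptions chosen to satisfy the
# checks in fwd_cuda (4D fp16/bf16 scores, key_seq_len <= 4096, query_seq_len > 1):
#
#     import torch
#     from megatron import fused_kernels
#     fused_kernels.load(args)                    # requires args.masked_softmax_fusion
#     import scaled_masked_softmax_cuda           # module name registered via cpp_extension.load
#
#     b, heads, sq, sk = 2, 8, 128, 128
#     scores = torch.randn(b, heads, sq, sk, dtype=torch.half, device='cuda')
#     mask = torch.zeros(b, 1, sq, sk, dtype=torch.bool, device='cuda')    # True = masked out
#     probs = scaled_masked_softmax_cuda.forward(scores, mask, 1.0)        # fused scale + mask + softmax
#     grads = scaled_masked_softmax_cuda.backward(torch.ones_like(probs), probs, 1.0)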
-------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd( 21 | torch::Tensor const& input, 22 | float scale_factor) { 23 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 24 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 25 | (input.scalar_type() == at::ScalarType::BFloat16), 26 | "Only fp16 and bf16 are supported"); 27 | 28 | return fwd_cuda(input, scale_factor); 29 | } 30 | 31 | torch::Tensor bwd( 32 | torch::Tensor const& output_grads, 33 | torch::Tensor const& softmax_results, 34 | float scale_factor) { 35 | 36 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 37 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 38 | 39 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 40 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 41 | "Only fp16 and bf16 are supported"); 42 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 43 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 44 | "Only fp16 and bf16 are supported"); 45 | 46 | return bwd_cuda(output_grads, softmax_results, scale_factor); 47 | } 48 | 49 | } // end namespace scaled_softmax 50 | } // end namespace fused_softmax 51 | } // end namespace multihead_attn 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("forward", 55 | &multihead_attn::fused_softmax::scaled_softmax::fwd, 56 | "Self Multihead Attention scaled, softmax -- Forward."); 57 | m.def("backward", 58 | &multihead_attn::fused_softmax::scaled_softmax::bwd, 59 | "Self Multihead Attention scaled, softmax -- Backward."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #ifndef __HIP_PLATFORM_HCC__ 8 | #include 9 | #endif 10 | #include 11 | #include 12 | #include "scaled_masked_softmax.h" 13 | #include "type_shim.h" 14 | 15 | namespace multihead_attn { 16 | namespace fused_softmax { 17 | namespace scaled_softmax { 18 | 19 | torch::Tensor fwd_cuda( 20 | torch::Tensor const& input, 21 | float scale_factor) 22 | { 23 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 24 | const int batches = input.size(0); 25 | const int attn_heads = input.size(1); 26 | const int query_seq_len = input.size(2); 27 | const int key_seq_len = input.size(3); 28 | TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); 29 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 30 | 31 | // Output 32 | auto act_options = input.options().requires_grad(false); 33 | torch::Tensor softmax_results = 34 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 35 | 36 | // Softmax Intermediate Result Ptr 37 | void* input_ptr = static_cast(input.data_ptr()); 38 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 39 | 40 | DISPATCH_HALF_AND_BFLOAT( 41 | input.scalar_type(), 42 | "dispatch_scaled_softmax_forward", 43 | dispatch_scaled_softmax_forward( 44 | reinterpret_cast(softmax_results_ptr), 45 | reinterpret_cast(input_ptr), 46 | scale_factor, 47 | query_seq_len, 48 | key_seq_len, 49 | batches, 50 | attn_heads); 51 | ); 52 | return softmax_results; 53 | } 54 | 55 | torch::Tensor bwd_cuda( 56 | torch::Tensor const& output_grads_, 57 | torch::Tensor const& softmax_results_, 58 | float scale_factor) { 59 | 60 | auto output_grads = output_grads_.contiguous(); 61 | auto softmax_results = softmax_results_.contiguous(); 62 | 63 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 64 | const int batches = output_grads.size(0); 65 | const int attn_heads = output_grads.size(1); 66 | const int query_seq_len = output_grads.size(2); 67 | const int key_seq_len = output_grads.size(3); 68 | 69 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 70 | 71 | //Softmax Grad 72 | DISPATCH_HALF_AND_BFLOAT( 73 | output_grads_.scalar_type(), 74 | "dispatch_scaled_masked_softmax_backward", 75 | dispatch_scaled_masked_softmax_backward( 76 | reinterpret_cast(output_grads_ptr), 77 | reinterpret_cast(output_grads_ptr), 78 | reinterpret_cast(softmax_results.data_ptr()), 79 | scale_factor, 80 | query_seq_len, 81 | key_seq_len, 82 | batches, 83 | attn_heads); 84 | ); 85 | 86 | //backward pass is completely in-place 87 | return output_grads; 88 | } 89 | } 90 | } 91 | } 92 | 93 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include <cuda_fp16.h> 4 | #include <torch/extension.h> 5 | #include <vector> 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_upper_triang_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 21 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 22 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 23 | (input.scalar_type() == at::ScalarType::BFloat16), 24 | "Only fp16 and bf16 are supported"); 25 | 26 | return fwd_cuda(input, scale_factor); 27 | } 28 | 29 | torch::Tensor bwd( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor) { 33 | 34 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 35 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 36 | 37 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 38 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 39 | "Only fp16 and bf16 are supported"); 40 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 41 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 42 | "Only fp16 and bf16 are supported"); 43 | 44 | return bwd_cuda(output_grads, softmax_results, scale_factor); 45 | } 46 | 47 | } // end namespace scaled_upper_triang_masked_softmax 48 | } // end namespace fused_softmax 49 | } // end namespace multihead_attn 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", 53 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 54 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 55 | m.def("backward", 56 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 57 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 58 | } 59 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #ifndef __HIP_PLATFORM_HCC__ 8 | #include 9 | #endif 10 | #include 11 | #include 12 | #include "scaled_upper_triang_masked_softmax.h" 13 | #include "type_shim.h" 14 | 15 | namespace multihead_attn { 16 | namespace fused_softmax { 17 | namespace scaled_upper_triang_masked_softmax { 18 | 19 | torch::Tensor fwd_cuda( 20 | torch::Tensor const& input, 21 | float scale_factor) 22 | { 23 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 24 | const int attn_batches = input.size(0); 25 | const int seq_len = input.size(1); 26 | TORCH_INTERNAL_ASSERT(seq_len <= 16384); 27 | 28 | // Output 29 | auto act_options = input.options().requires_grad(false); 30 | torch::Tensor softmax_results = 31 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 32 | 33 | // Softmax Intermediate Result Ptr 34 | void* input_ptr = static_cast(input.data_ptr()); 35 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 36 | 37 | DISPATCH_HALF_AND_BFLOAT( 38 | input.scalar_type(), 39 | "dispatch_scaled_upper_triang_masked_softmax_forward", 40 | dispatch_scaled_upper_triang_masked_softmax_forward( 41 | reinterpret_cast(softmax_results_ptr), 42 | reinterpret_cast(input_ptr), 43 | scale_factor, 44 | seq_len, 45 | seq_len, 46 | attn_batches); 47 | ); 48 | return softmax_results; 49 | } 50 | 51 | 52 | torch::Tensor bwd_cuda( 53 | torch::Tensor const& output_grads_, 54 | torch::Tensor const& softmax_results_, 55 | float scale_factor) { 56 | 57 | auto output_grads = output_grads_.contiguous(); 58 | auto softmax_results = softmax_results_.contiguous(); 59 | 60 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 61 | const int attn_batches = output_grads.size(0); 62 | const int seq_len = output_grads.size(1); 63 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 64 | 65 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 66 | 67 | //Softmax Grad 68 | DISPATCH_HALF_AND_BFLOAT( 69 | output_grads_.scalar_type(), 70 | "dispatch_scaled_upper_triang_masked_softmax_backward", 71 | dispatch_scaled_upper_triang_masked_softmax_backward( 72 | reinterpret_cast(output_grads_ptr), 73 | reinterpret_cast(output_grads_ptr), 74 | reinterpret_cast(softmax_results.data_ptr()), 75 | scale_factor, 76 | seq_len, 77 | seq_len, 78 | attn_batches); 79 | ); 80 | 81 | //backward pass is completely in-place 82 | return output_grads; 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /src/megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | 4 | #include 5 | #include "compat.h" 6 | 7 | 8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) 
\ 9 | switch(TYPE) \ 10 | { \ 11 | case at::ScalarType::Half: \ 12 | { \ 13 | using scalar_t = at::Half; \ 14 | __VA_ARGS__; \ 15 | break; \ 16 | } \ 17 | case at::ScalarType::BFloat16: \ 18 | { \ 19 | using scalar_t = at::BFloat16; \ 20 | __VA_ARGS__; \ 21 | break; \ 22 | } \ 23 | default: \ 24 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 25 | } 26 | 27 | 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ 29 | switch(TYPE) \ 30 | { \ 31 | case at::ScalarType::Half: \ 32 | { \ 33 | using scalar_t = at::Half; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | case at::ScalarType::BFloat16: \ 38 | { \ 39 | using scalar_t = at::BFloat16; \ 40 | __VA_ARGS__; \ 41 | break; \ 42 | } \ 43 | case at::ScalarType::Float: \ 44 | { \ 45 | using scalar_t = float; \ 46 | __VA_ARGS__; \ 47 | break; \ 48 | } \ 49 | default: \ 50 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 51 | } 52 | 53 | 54 | 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 56 | switch(TYPEIN) \ 57 | { \ 58 | case at::ScalarType::Float: \ 59 | { \ 60 | using scalar_t_in = float; \ 61 | switch(TYPEOUT) \ 62 | { \ 63 | case at::ScalarType::Float: \ 64 | { \ 65 | using scalar_t_out = float; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | case at::ScalarType::Half: \ 70 | { \ 71 | using scalar_t_out = at::Half; \ 72 | __VA_ARGS__; \ 73 | break; \ 74 | } \ 75 | case at::ScalarType::BFloat16: \ 76 | { \ 77 | using scalar_t_out = at::BFloat16; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: \ 82 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 83 | } \ 84 | break; \ 85 | } \ 86 | case at::ScalarType::Half: \ 87 | { \ 88 | using scalar_t_in = at::Half; \ 89 | using scalar_t_out = at::Half; \ 90 | __VA_ARGS__; \ 91 | break; \ 92 | } \ 93 | case at::ScalarType::BFloat16: \ 94 | { \ 95 | using scalar_t_in = at::BFloat16; \ 96 | using scalar_t_out = at::BFloat16; \ 97 | __VA_ARGS__; \ 98 | break; \ 99 | } \ 100 | default: \ 101 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/megatron/global_vars.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Megatron global variables.""" 4 | 5 | import os 6 | import sys 7 | import torch 8 | 9 | from megatron import dist_signal_handler 10 | from megatron.tokenizer import build_tokenizer 11 | from .microbatches import build_num_microbatches_calculator 12 | from .timers import Timers 13 | 14 | _GLOBAL_ARGS = None 15 | _GLOBAL_RETRO_ARGS = None 16 | _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None 17 | _GLOBAL_TOKENIZER = None 18 | _GLOBAL_TENSORBOARD_WRITER = None 19 | _GLOBAL_ADLR_AUTORESUME = None 20 | _GLOBAL_TIMERS = None 21 | _GLOBAL_SIGNAL_HANDLER = None 22 | 23 | def get_args(): 24 | """Return arguments.""" 25 | _ensure_var_is_initialized(_GLOBAL_ARGS, 'args') 26 | return _GLOBAL_ARGS 27 | 28 | 29 | def get_retro_args(): 30 | """Return retro arguments.""" 31 | return _GLOBAL_RETRO_ARGS 32 | 33 | 34 | def get_num_microbatches(): 35 | return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get() 36 | 37 | 38 | def get_current_global_batch_size(): 39 | return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size() 40 | 41 | 42 | def update_num_microbatches(consumed_samples, consistency_check=True): 43 | _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples, 44 | consistency_check) 45 | 46 | 47 | def get_tokenizer(): 48 | """Return tokenizer.""" 49 | _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer') 50 | return _GLOBAL_TOKENIZER 51 | 52 | 53 | def get_tensorboard_writer(): 54 | """Return tensorboard writer. It can be None so no need 55 | to check if it is initialized.""" 56 | return _GLOBAL_TENSORBOARD_WRITER 57 | 58 | 59 | def get_adlr_autoresume(): 60 | """ADLR autoresume object. It can be None so no need 61 | to check if it is initialized.""" 62 | return _GLOBAL_ADLR_AUTORESUME 63 | 64 | 65 | def get_timers(): 66 | """Return timers.""" 67 | _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers') 68 | return _GLOBAL_TIMERS 69 | 70 | 71 | def get_signal_handler(): 72 | _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') 73 | return _GLOBAL_SIGNAL_HANDLER 74 | 75 | 76 | def _set_signal_handler(): 77 | global _GLOBAL_SIGNAL_HANDLER 78 | _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') 79 | _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__() 80 | 81 | 82 | 83 | def set_global_variables(args): 84 | """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" 85 | 86 | assert args is not None 87 | 88 | _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') 89 | set_args(args) 90 | 91 | _build_num_microbatches_calculator(args) 92 | _ = _build_tokenizer(args) 93 | _set_tensorboard_writer(args) 94 | _set_adlr_autoresume(args) 95 | _set_timers(args) 96 | 97 | if args.exit_signal_handler: 98 | _set_signal_handler() 99 | 100 | 101 | def set_args(args): 102 | global _GLOBAL_ARGS 103 | _GLOBAL_ARGS = args 104 | 105 | 106 | def set_retro_args(retro_args): 107 | global _GLOBAL_RETRO_ARGS 108 | _GLOBAL_RETRO_ARGS = retro_args 109 | 110 | 111 | def _build_num_microbatches_calculator(args): 112 | 113 | global _GLOBAL_NUM_MICROBATCHES_CALCULATOR 114 | _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, 115 | 'num microbatches calculator') 116 | 117 | _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( 118 | args) 119 | 120 | 121 | def _build_tokenizer(args): 122 | """Initialize tokenizer.""" 123 | global _GLOBAL_TOKENIZER 124 | _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer') 125 | _GLOBAL_TOKENIZER = build_tokenizer(args) 126 | return _GLOBAL_TOKENIZER 127 | 
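# (Added note.) Usage sketch: the training entry point calls set_global_variables(args)
# exactly once after argument parsing; later code then reaches the shared state only through
# the getters defined above. The argument object below is a placeholder for whatever the
# megatron argument parser produced:
#
#     from megatron.global_vars import set_global_variables, get_args, get_tokenizer, get_timers
#     set_global_variables(args)        # must be called exactly once
#     tokenizer = get_tokenizer()       # now available from anywhere in the code base
#     timers = get_timers()
#     assert get_args() is args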
128 | 129 | def rebuild_tokenizer(args): 130 | global _GLOBAL_TOKENIZER 131 | _GLOBAL_TOKENIZER = None 132 | return _build_tokenizer(args) 133 | 134 | 135 | def _set_tensorboard_writer(args): 136 | """Set tensorboard writer.""" 137 | global _GLOBAL_TENSORBOARD_WRITER 138 | _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 139 | 'tensorboard writer') 140 | 141 | if hasattr(args, 'tensorboard_dir') and \ 142 | args.tensorboard_dir and args.rank == (args.world_size - 1): 143 | try: 144 | from torch.utils.tensorboard import SummaryWriter 145 | print('> setting tensorboard ...') 146 | _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( 147 | log_dir=args.tensorboard_dir, 148 | max_queue=args.tensorboard_queue_size) 149 | except ModuleNotFoundError: 150 | print('WARNING: TensorBoard writing requested but is not ' 151 | 'available (are you using PyTorch 1.1.0 or later?), ' 152 | 'no TensorBoard logs will be written.', flush=True) 153 | 154 | 155 | def _set_adlr_autoresume(args): 156 | """Initialize ADLR autoresume.""" 157 | global _GLOBAL_ADLR_AUTORESUME 158 | _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume') 159 | 160 | if args.adlr_autoresume: 161 | if args.rank == 0: 162 | print('enabling autoresume ...', flush=True) 163 | sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.')) 164 | try: 165 | from userlib.auto_resume import AutoResume 166 | except BaseException: 167 | print('ADLR autoresume is not available, exiting ...') 168 | sys.exit() 169 | 170 | _GLOBAL_ADLR_AUTORESUME = AutoResume 171 | 172 | 173 | def _set_timers(args): 174 | """Initialize timers.""" 175 | global _GLOBAL_TIMERS 176 | _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers') 177 | _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option) 178 | 179 | 180 | def _ensure_var_is_initialized(var, name): 181 | """Make sure the input variable is not None.""" 182 | assert var is not None, '{} is not initialized.'.format(name) 183 | 184 | 185 | def _ensure_var_is_not_initialized(var, name): 186 | """Make sure the input variable is not None.""" 187 | assert var is None, '{} is already initialized.'.format(name) 188 | -------------------------------------------------------------------------------- /src/megatron/indexer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import torch 4 | import torch.distributed as dist 5 | 6 | from megatron import get_args, print_rank_0 7 | from megatron.core import mpu 8 | from megatron.checkpointing import load_biencoder_checkpoint 9 | from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset 10 | from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch 11 | from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader 12 | from megatron.data.realm_index import detach, OpenRetreivalDataStore 13 | from megatron.model.biencoder_model import get_model_provider 14 | from megatron.training import get_model 15 | 16 | 17 | class IndexBuilder(object): 18 | """ 19 | Object for taking one pass over a dataset and creating a BlockData of its 20 | embeddings 21 | """ 22 | def __init__(self): 23 | args = get_args() 24 | self.model = None 25 | self.dataloader = None 26 | self.evidence_embedder_obj = None 27 | self.biencoder_shared_query_context_model = \ 28 | args.biencoder_shared_query_context_model 29 | 30 | # need to know whether we're using a REALM checkpoint (args.load) 31 | # or ICT checkpoint 32 | assert not (args.load and args.ict_load) 33 | 34 | 
self.log_interval = args.indexer_log_interval 35 | self.batch_size = args.indexer_batch_size 36 | 37 | self.load_attributes() 38 | self.is_main_builder = mpu.get_data_parallel_rank() == 0 39 | self.num_total_builders = mpu.get_data_parallel_world_size() 40 | self.iteration = self.total_processed = 0 41 | 42 | def load_attributes(self): 43 | """ 44 | Load the necessary attributes: model, dataloader and empty BlockData 45 | """ 46 | only_context_model = True 47 | if self.biencoder_shared_query_context_model: 48 | only_context_model = False 49 | 50 | model = get_model(get_model_provider(only_context_model=\ 51 | only_context_model, biencoder_shared_query_context_model=\ 52 | self.biencoder_shared_query_context_model)) 53 | 54 | self.model = load_biencoder_checkpoint(model, 55 | only_context_model=only_context_model) 56 | 57 | assert len(self.model) == 1 58 | self.model[0].eval() 59 | 60 | self.dataset = get_open_retrieval_wiki_dataset() 61 | self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \ 62 | self.batch_size)) 63 | 64 | self.evidence_embedder_obj = OpenRetreivalDataStore( \ 65 | load_from_path=False) 66 | 67 | def track_and_report_progress(self, batch_size): 68 | """ 69 | Utility function for tracking progress 70 | """ 71 | self.iteration += 1 72 | self.total_processed += batch_size * self.num_total_builders 73 | if self.is_main_builder and self.iteration % self.log_interval == 0: 74 | print('Batch {:10d} | Total {:10d}'.format(self.iteration, 75 | self.total_processed), flush=True) 76 | 77 | def build_and_save_index(self): 78 | """ 79 | Goes through one epoch of the dataloader and adds all data to this 80 | instance's BlockData. 81 | 82 | The copy of BlockData is saved as a shard, which when run in a 83 | distributed setting will be consolidated by the rank 0 process 84 | and saved as a final pickled BlockData. 
85 | """ 86 | assert len(self.model) == 1 87 | unwrapped_model = self.model[0] 88 | 89 | while not hasattr(unwrapped_model, 'embed_text'): 90 | unwrapped_model = unwrapped_model.module 91 | 92 | while True: 93 | try: 94 | # batch also has query_tokens and query_pad_data 95 | row_id, context_tokens, context_mask, context_types, \ 96 | context_pad_mask = get_open_retrieval_batch( \ 97 | self.dataloader) 98 | except (StopIteration, IndexError): 99 | break 100 | 101 | # TODO: can we add with torch.no_grad() to reduce memory usage 102 | # detach, separate fields and add to BlockData 103 | assert context_mask.dtype == torch.bool 104 | context_logits = unwrapped_model.embed_text( 105 | unwrapped_model.context_model, context_tokens, context_mask, 106 | context_types) 107 | 108 | context_logits = detach(context_logits) 109 | row_id = detach(row_id) 110 | 111 | self.evidence_embedder_obj.add_block_data(row_id, context_logits) 112 | self.track_and_report_progress(batch_size=len(row_id)) 113 | 114 | # This process signals to finalize its shard and then synchronize with 115 | # the other processes 116 | self.evidence_embedder_obj.save_shard() 117 | torch.distributed.barrier() 118 | del self.model 119 | 120 | # rank 0 process builds the final copy 121 | if self.is_main_builder: 122 | self.evidence_embedder_obj.merge_shards_and_save() 123 | # make sure that every single piece of data was embedded 124 | assert len(self.evidence_embedder_obj.embed_data) == \ 125 | len(self.dataset) 126 | self.evidence_embedder_obj.clear() 127 | 128 | # complete building the final copy 129 | torch.distributed.barrier() 130 | -------------------------------------------------------------------------------- /src/megatron/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import torch 5 | from deepspeed.accelerator import get_accelerator 6 | 7 | # A dictionary of all the memory buffers allocated. 8 | _MEM_BUFFS = dict() 9 | 10 | 11 | def allocate_mem_buff(name, numel, dtype, track_usage): 12 | """Allocate a memory buffer.""" 13 | assert name not in _MEM_BUFFS, \ 14 | 'memory buffer {} already allocated.'.format(name) 15 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) 16 | return _MEM_BUFFS[name] 17 | 18 | 19 | def get_mem_buff(name): 20 | """Get the memory buffer.""" 21 | return _MEM_BUFFS[name] 22 | 23 | 24 | class MemoryBuffer: 25 | """Contiguous memory buffer. 26 | Allocate a contiguous memory of type `dtype` and size `numel`. It is 27 | used to reduce memory fragmentation. 28 | 29 | Usage: After the allocation, the `_start` index is set tot the first 30 | index of the memory. A memory chunk starting from `_start` index 31 | can be `allocated` for an input tensor, with the elements of the 32 | tensor being coppied. The buffer can be reused by resetting the 33 | `_start` index. 
34 | 35 | """ 36 | def __init__(self, name, numel, dtype, track_usage): 37 | if torch.distributed.get_rank() == 0: 38 | element_size = torch.tensor([], dtype=dtype).element_size() 39 | print('> building the {} memory buffer with {} num elements ' 40 | 'and {} dtype ({:.1f} MB)...'.format( 41 | name, numel, dtype, numel*element_size/1024/1024), 42 | flush=True) 43 | self.name = name 44 | self.numel = numel 45 | self.dtype = dtype 46 | self.data = torch.empty(self.numel, 47 | dtype=self.dtype, 48 | device=get_accelerator().current_device_name(), 49 | requires_grad=False) 50 | 51 | # Index tracking the start of the free memory. 52 | self._start = 0 53 | 54 | # Values used for tracking usage. 55 | self.track_usage = track_usage 56 | if self.track_usage: 57 | self.in_use_value = 0.0 58 | self.total_value = 0.0 59 | 60 | 61 | def reset(self): 62 | """Reset the buffer start index to the beginning of the buffer.""" 63 | self._start = 0 64 | 65 | 66 | def is_in_use(self): 67 | """Whether the current buffer hold on to any memory.""" 68 | return self._start > 0 69 | 70 | 71 | def numel_in_use(self): 72 | """Return number of elements in use.""" 73 | return self._start 74 | 75 | 76 | def add(self, tensor): 77 | """Allocate a chunk of memory from the buffer to tensor and copy 78 | the values.""" 79 | assert tensor.dtype == self.dtype, \ 80 | 'Input tensor type {} different from buffer type {}'.format( 81 | tensor.dtype, self.dtype) 82 | # Number of elements of the input tensor. 83 | tensor_numel = torch.numel(tensor) 84 | new_start = self._start + tensor_numel 85 | assert new_start <= self.numel, \ 86 | 'Not enough memory left in the buffer ({} > {})'.format( 87 | tensor_numel, self.numel - self._start) 88 | # New tensor is a view into the memory. 89 | new_tensor = self.data[self._start:new_start] 90 | self._start = new_start 91 | new_tensor = new_tensor.view(tensor.shape) 92 | new_tensor.copy_(tensor) 93 | # Return a pointer to the new tensor. 94 | return new_tensor 95 | 96 | 97 | def get_data(self): 98 | """Return the data currently in use.""" 99 | if self.track_usage: 100 | self.in_use_value += float(self._start) 101 | self.total_value += float(self.numel) 102 | return self.data[:self._start] 103 | 104 | 105 | def print_average_usage(self): 106 | """Print memory usage average over time. We would like this value 107 | to be as high as possible.""" 108 | assert self.track_usage, 'You need to enable track usage.' 109 | if torch.distributed.get_rank() == 0: 110 | print(' > usage of {} memory buffer: {:.2f} %'.format( 111 | self.name, self.in_use_value * 100.0 / self.total_value), 112 | flush=True) 113 | 114 | 115 | 116 | class RingMemBuffer: 117 | """A ring of memory buffers.""" 118 | 119 | def __init__(self, name, num_buffers, numel, dtype, track_usage): 120 | self.num_buffers = num_buffers 121 | self.buffers = [ 122 | allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) 123 | for i in range(num_buffers)] 124 | self._index = -1 125 | 126 | 127 | def get_next_buffer(self): 128 | self._index += 1 129 | self._index = self._index % self.num_buffers 130 | buff = self.buffers[self._index] 131 | assert not buff.is_in_use(), 'buffer is already in use.' 132 | return buff 133 | -------------------------------------------------------------------------------- /src/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from deepspeed.accelerator.real_accelerator import get_accelerator 4 | if get_accelerator().device_name() == 'cuda': 5 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 6 | else: 7 | from torch.nn import LayerNorm 8 | 9 | from .distributed import DistributedDataParallel 10 | from .bert_model import BertModel 11 | from .gpt_model import GPTModel, GPTModelPipe 12 | from .t5_model import T5Model 13 | from .language_model import get_language_model 14 | from .module import Float16Module 15 | -------------------------------------------------------------------------------- /src/megatron/model/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Classification model.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args, print_rank_last 8 | from megatron.model.enums import AttnMaskType 9 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids 10 | from megatron.model.language_model import get_language_model 11 | from megatron.model.utils import get_linear_layer 12 | from megatron.model.utils import init_method_normal 13 | from megatron.model.utils import scaled_init_method_normal 14 | from .module import MegatronModule 15 | 16 | 17 | class Classification(MegatronModule): 18 | 19 | def __init__(self, 20 | config, 21 | num_classes, 22 | num_tokentypes=2, 23 | pre_process=True, 24 | post_process=True): 25 | super().__init__(config=config, share_embeddings_and_output_weights=False) 26 | args = get_args() 27 | 28 | self.num_classes = num_classes 29 | self.pre_process = pre_process 30 | self.post_process = post_process 31 | 32 | self.language_model, self._language_model_key = get_language_model( 33 | config=config, 34 | num_tokentypes=num_tokentypes, 35 | add_pooler=True, 36 | encoder_attn_mask_type=AttnMaskType.padding, 37 | pre_process=self.pre_process, 38 | post_process=self.post_process) 39 | 40 | # Multi-choice head. 41 | if self.post_process: 42 | self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) 43 | self.classification_head = get_linear_layer(args.hidden_size, 44 | self.num_classes, 45 | init_method, 46 | gather_params_on_init=args.zero_stage == 3) 47 | self._classification_head_key = 'classification_head' 48 | 49 | def set_input_tensor(self, input_tensor): 50 | """See megatron.model.transformer.set_input_tensor()""" 51 | self.language_model.set_input_tensor(input_tensor) 52 | 53 | def forward(self, model_input, attention_mask, tokentype_ids=None): 54 | 55 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 56 | input_ids = model_input 57 | position_ids = bert_position_ids(input_ids) 58 | 59 | lm_output = self.language_model( 60 | input_ids, 61 | position_ids, 62 | extended_attention_mask, 63 | tokentype_ids=tokentype_ids 64 | ) 65 | 66 | if self.post_process: 67 | _, pooled_output = lm_output[0], lm_output[1] 68 | classification_output = self.classification_dropout(pooled_output) 69 | classification_logits = self.classification_head(classification_output) 70 | 71 | # Reshape back to separate choices. 
72 | classification_logits = classification_logits.view(-1, self.num_classes) 73 | 74 | return classification_logits 75 | return lm_output 76 | 77 | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): 78 | """For easy load when model is combined with other heads, 79 | add an extra key.""" 80 | 81 | state_dict_ = {} 82 | state_dict_[self._language_model_key] \ 83 | = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, 84 | keep_vars=keep_vars) 85 | if self.post_process: 86 | state_dict_[self._classification_head_key] \ 87 | = self.classification_head.state_dict(prefix=prefix, keep_vars=keep_vars) 88 | return state_dict_ 89 | 90 | def load_state_dict(self, state_dict, strict=True): 91 | """Customized load.""" 92 | 93 | self.language_model.load_state_dict( 94 | state_dict[self._language_model_key], strict=strict) 95 | if self.post_process: 96 | if self._classification_head_key in state_dict: 97 | self.classification_head.load_state_dict( 98 | state_dict[self._classification_head_key], strict=strict) 99 | else: 100 | print_rank_last('***WARNING*** could not find {} in the checkpoint, ' 101 | 'initializing to random'.format( 102 | self._classification_head_key)) 103 | -------------------------------------------------------------------------------- /src/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /src/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /src/megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """This code is copied fron NVIDIA apex: 4 | https://github.com/NVIDIA/apex 5 | with some changes. """ 6 | 7 | import numbers 8 | import torch 9 | from torch.nn.parameter import Parameter 10 | from torch.nn import init 11 | import importlib 12 | from torch.nn import functional as F 13 | 14 | from megatron.core.utils import make_viewless_tensor 15 | 16 | try: 17 | from apex.contrib.layer_norm.layer_norm import FastLayerNormFN 18 | HAVE_PERSIST_LAYER_NORM = True 19 | except: 20 | HAVE_PERSIST_LAYER_NORM = False 21 | 22 | from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction 23 | 24 | 25 | global fused_layer_norm_cuda 26 | fused_layer_norm_cuda = None 27 | 28 | 29 | class MixedFusedLayerNorm(torch.nn.Module): 30 | 31 | def __init__(self, normalized_shape, eps=1e-5, 32 | no_persist_layer_norm=True, 33 | sequence_parallel=False, 34 | apply_layernorm_1p=False): 35 | super(MixedFusedLayerNorm, self).__init__() 36 | 37 | self.apply_layernorm_1p = apply_layernorm_1p 38 | 39 | global fused_layer_norm_cuda 40 | fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") 41 | 42 | # List of hiddens sizes supported in the persistent layer norm kernel 43 | # If the hidden size is not supported, fall back to the non-persistent 44 | # kernel. 
45 | persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096, 46 | 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480, 47 | 24576, 25600, 30720, 32768, 40960, 49152, 65536] 48 | if normalized_shape not in persist_ln_hidden_sizes or \ 49 | not HAVE_PERSIST_LAYER_NORM: 50 | no_persist_layer_norm = True 51 | 52 | if isinstance(normalized_shape, numbers.Integral): 53 | normalized_shape = (normalized_shape,) 54 | self.normalized_shape = torch.Size(normalized_shape) 55 | self.eps = eps 56 | self.weight = Parameter(torch.Tensor(*normalized_shape)) 57 | self.bias = Parameter(torch.Tensor(*normalized_shape)) 58 | self.reset_parameters() 59 | self.no_persist_layer_norm = no_persist_layer_norm 60 | self.sequence_parallel = sequence_parallel 61 | 62 | # set sequence parallelism flag on weight and bias parameters 63 | setattr(self.weight, 'sequence_parallel', self.sequence_parallel) 64 | setattr(self.bias, 'sequence_parallel', self.sequence_parallel) 65 | 66 | 67 | def reset_parameters(self): 68 | 69 | if self.apply_layernorm_1p: 70 | init.zeros_(self.weight) 71 | init.zeros_(self.bias) 72 | else: 73 | init.ones_(self.weight) 74 | init.zeros_(self.bias) 75 | 76 | def forward(self, input): 77 | 78 | weight = self.weight + 1 if self.apply_layernorm_1p else self.weight 79 | # CPU path is here for unittest sake. 80 | if not input.is_cuda: 81 | print("WARNING! The input of FusedLayerNorm should be on the GPU." 82 | "This warning should only be triggered in the FusedLayerNorm unit tests.") 83 | return F.layer_norm(input, self.normalized_shape, weight, self.bias, self.eps) 84 | 85 | if self.no_persist_layer_norm: 86 | return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps) 87 | else: 88 | output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) 89 | 90 | # Apex's fast layer norm function outputs a 'view' tensor (i.e., has 91 | # a populated '_base' field). This will result in schedule.py's 92 | # deallocate_output_tensor() throwing an error, so a viewless tensor is 93 | # created to prevent this. 94 | output = make_viewless_tensor(inp = output, 95 | requires_grad = input.requires_grad, 96 | keep_graph = True) 97 | 98 | return output 99 | -------------------------------------------------------------------------------- /src/megatron/model/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Multiple choice model.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args, print_rank_last 8 | from megatron.model.enums import AttnMaskType 9 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids 10 | from megatron.model.language_model import get_language_model 11 | from megatron.model.utils import get_linear_layer 12 | from megatron.model.utils import init_method_normal 13 | from megatron.model.utils import scaled_init_method_normal 14 | from .module import MegatronModule 15 | 16 | 17 | class MultipleChoice(MegatronModule): 18 | 19 | def __init__(self, 20 | config, 21 | num_tokentypes=2, 22 | pre_process=True, 23 | post_process=True): 24 | super(MultipleChoice, self).__init__(share_embeddings_and_output_weights=False) 25 | args = get_args() 26 | 27 | self.pre_process = pre_process 28 | self.post_process = post_process 29 | 30 | self.language_model, self._language_model_key = get_language_model( 31 | config=config, 32 | num_tokentypes=num_tokentypes, 33 | add_pooler=True, 34 | encoder_attn_mask_type=AttnMaskType.padding, 35 | pre_process=self.pre_process, 36 | post_process=self.post_process) 37 | 38 | # Multi-choice head. 39 | if self.post_process: 40 | self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout) 41 | self.multichoice_head = get_linear_layer(args.hidden_size, 1, 42 | init_method, 43 | gather_params_on_init=args.zero_stage == 3) 44 | self._multichoice_head_key = 'multichoice_head' 45 | 46 | def set_input_tensor(self, input_tensor): 47 | """See megatron.model.transformer.set_input_tensor()""" 48 | self.language_model.set_input_tensor(input_tensor) 49 | 50 | def forward(self, model_input, attention_mask, tokentype_ids=None): 51 | 52 | # [batch, choices, sequence] --> [batch * choices, sequence] --> 53 | # transformer --> [batch, choices] --> softmax 54 | 55 | # Ensure the shape is [batch-size, choices, sequence] 56 | assert len(attention_mask.shape) == 3 57 | num_choices = attention_mask.shape[1] 58 | 59 | # Reshape and treat choice dimension the same as batch. 60 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 61 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 62 | 63 | input_ids = model_input 64 | # Do the same as attention_mask for input_ids, tokentype_ids 65 | assert len(input_ids.shape) == 3 66 | assert len(tokentype_ids.shape) == 3 67 | input_ids = input_ids.view(-1, input_ids.size(-1)) 68 | tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1)) 69 | position_ids = bert_position_ids(input_ids) 70 | 71 | lm_output = self.language_model( 72 | input_ids, 73 | position_ids, 74 | extended_attention_mask, 75 | tokentype_ids=tokentype_ids 76 | ) 77 | if self.post_process: 78 | _, pooled_output = lm_output[0], lm_output[1] 79 | multichoice_output = self.multichoice_dropout(pooled_output) 80 | multichoice_logits = self.multichoice_head(multichoice_output) 81 | 82 | # Reshape back to separate choices. 
83 | multichoice_logits = multichoice_logits.view(-1, num_choices) 84 | 85 | return multichoice_logits 86 | return lm_output 87 | 88 | def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): 89 | """For easy load when model is combined with other heads, 90 | add an extra key.""" 91 | 92 | state_dict_ = {} 93 | state_dict_[self._language_model_key] \ 94 | = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, 95 | keep_vars=keep_vars) 96 | if self.post_process: 97 | state_dict_[self._multichoice_head_key] \ 98 | = self.multichoice_head.state_dict(prefix=prefix, keep_vars=keep_vars) 99 | return state_dict_ 100 | 101 | def load_state_dict(self, state_dict, strict=True): 102 | """Customized load.""" 103 | 104 | self.language_model.load_state_dict( 105 | state_dict[self._language_model_key], strict=strict) 106 | if self.post_process: 107 | if self._multichoice_head_key in state_dict: 108 | self.multichoice_head.load_state_dict( 109 | state_dict[self._multichoice_head_key], strict=strict) 110 | else: 111 | print_rank_last('***WARNING*** could not find {} in the checkpoint, ' 112 | 'initializing to random'.format( 113 | self._multichoice_head_key)) 114 | -------------------------------------------------------------------------------- /src/megatron/model/rotary_pos_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The following code has been taken from https://github.com/NVIDIA/NeMo/blob/ \ 4 | # 782b4e1652aaa43c8be390d9db0dc89544afa080/nemo/collections/nlp/modules/ \ 5 | # common/megatron/rotary_pos_embedding.py 6 | 7 | import importlib.util 8 | import torch 9 | 10 | from torch import einsum, nn 11 | 12 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] 13 | 14 | class RotaryEmbedding(nn.Module): 15 | def __init__(self, dim): 16 | super().__init__() 17 | inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) 18 | self.register_buffer('inv_freq', inv_freq) 19 | if importlib.util.find_spec('einops') is None: 20 | raise RuntimeError("einops is required for Rotary Embedding") 21 | 22 | def forward(self, max_seq_len, offset=0): 23 | seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset 24 | freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) 25 | # first part even vector components, second part odd vector components, 26 | # 2 * dim in dimension size 27 | emb = torch.cat((freqs, freqs), dim=-1) 28 | # emb [seq_length, .., dim] 29 | from einops import rearrange 30 | return rearrange(emb, 'n d -> n 1 1 d') 31 | 32 | 33 | def _rotate_half(x): 34 | """ 35 | change sign so the last dimension becomes [-odd, +even] 36 | """ 37 | from einops import rearrange 38 | x = rearrange(x, '... (j d) -> ... 
j d', j=2) 39 | x1, x2 = x.unbind(dim=-2) 40 | return torch.cat((-x2, x1), dim=-1) 41 | 42 | 43 | def apply_rotary_pos_emb(t, freqs): 44 | """ 45 | input tensor t is of shape [seq_length, ..., dim] 46 | rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] 47 | check https://kexue.fm/archives/8265 for detailed formulas 48 | """ 49 | rot_dim = freqs.shape[-1] 50 | # ideally t_pass is empty so rotary pos embedding is applied to all tensor t 51 | t, t_pass = t[..., :rot_dim], t[..., rot_dim:] 52 | 53 | # first part is cosine component 54 | # second part is sine component, need to change signs with _rotate_half method 55 | t = (t * freqs.cos().to(t.dtype)) + (_rotate_half(t) * freqs.sin().to(t.dtype)) 56 | return torch.cat((t, t_pass), dim=-1) 57 | -------------------------------------------------------------------------------- /src/megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | 11 | from deepspeed.runtime.zero import GatheredParameters 12 | 13 | def init_method_normal(sigma): 14 | """Init method based on N(0, sigma).""" 15 | def init_(tensor): 16 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 17 | 18 | return init_ 19 | 20 | 21 | def scaled_init_method_normal(sigma, num_layers): 22 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 23 | std = sigma / math.sqrt(2.0 * num_layers) 24 | 25 | def init_(tensor): 26 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 27 | 28 | return init_ 29 | 30 | 31 | def gather_and_init(param, init_method): 32 | with GatheredParameters(param, modifier_rank=0): 33 | init_method(param) 34 | 35 | 36 | def attention_mask_func(attention_scores, attention_mask): 37 | args = get_args() 38 | if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: 39 | attention_mask_ = attention_mask 40 | actual_seqlen = attention_scores.size()[2] 41 | if actual_seqlen != attention_mask_.size()[2]: 42 | # attention_mask has size [1, 1, seqlen, seqlen] 43 | attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() 44 | attention_scores.masked_fill_(attention_mask_, -10000.0) 45 | else: 46 | attention_scores.masked_fill_(attention_mask, -10000.0) 47 | return attention_scores 48 | 49 | 50 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False): 51 | """Simple linear layer with weight initialization.""" 52 | layer = torch.nn.Linear(rows, columns) 53 | if get_args().perform_initialization: 54 | with GatheredParameters(layer.weight, modifier_rank=0, enabled=gather_params_on_init): 55 | init_method(layer.weight) 56 | with torch.no_grad(): 57 | with GatheredParameters(layer.bias, modifier_rank=0, enabled=gather_params_on_init): 58 | layer.bias.zero_() 59 | return layer 60 | 61 | @torch.jit.script 62 | def gelu_impl(x): 63 | """OpenAI's gelu implementation.""" 64 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 65 | (1.0 + 0.044715 * x * x))) 66 | def openai_gelu(x): 67 | return gelu_impl(x) 68 | 69 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 70 | @torch.jit.script 71 | def erf_gelu(x): 72 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 73 | 
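# ---------------------------------------------------------------------------
# Editor's addition: a hedged, illustrative sanity check (not part of the
# original file). It compares the tanh-style GELU approximation used by
# gelu_impl/openai_gelu above with the exact erf-based form computed by
# erf_gelu, and with torch's own tanh approximation (PyTorch >= 1.12).
if __name__ == "__main__":
    x = torch.linspace(-6.0, 6.0, steps=1001)
    approx = openai_gelu(x)
    exact = erf_gelu(x)
    # The approximation error is small (a few 1e-4 at most on this range).
    print("max |tanh-approx - exact| =", (approx - exact).abs().max().item())
    assert (approx - exact).abs().max() < 1e-2
    # torch implements the same tanh approximation natively.
    assert torch.allclose(
        approx, torch.nn.functional.gelu(x, approximate="tanh"), atol=1e-5)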
-------------------------------------------------------------------------------- /src/megatron/model/vision/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Vision Transformer(VIT) model.""" 4 | 5 | import torch 6 | from torch.nn.init import trunc_normal_ 7 | from megatron import get_args 8 | from megatron.model.utils import get_linear_layer 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3_avg 11 | from megatron.model.module import MegatronModule 12 | 13 | class VitClassificationModel(MegatronModule): 14 | """Vision Transformer Model.""" 15 | 16 | def __init__(self, config, num_classes, finetune=False, 17 | pre_process=True, post_process=True): 18 | super(VitClassificationModel, self).__init__() 19 | args = get_args() 20 | 21 | self.hidden_size = args.hidden_size 22 | self.num_classes = num_classes 23 | self.finetune = finetune 24 | self.pre_process = pre_process 25 | self.post_process = post_process 26 | self.backbone = VitBackbone( 27 | config=config, 28 | pre_process=self.pre_process, 29 | post_process=self.post_process, 30 | single_token_output=True 31 | ) 32 | 33 | if self.post_process: 34 | if not self.finetune: 35 | self.head = VitMlpHead(self.hidden_size, self.num_classes) 36 | else: 37 | self.head = get_linear_layer( 38 | self.hidden_size, 39 | self.num_classes, 40 | torch.nn.init.zeros_, 41 | gather_params_on_init=args.zero_stage == 3 42 | ) 43 | 44 | def set_input_tensor(self, input_tensor): 45 | """See megatron.model.transformer.set_input_tensor()""" 46 | self.backbone.set_input_tensor(input_tensor) 47 | 48 | def forward(self, input): 49 | hidden_states = self.backbone(input) 50 | 51 | if self.post_process: 52 | hidden_states = self.head(hidden_states) 53 | 54 | return hidden_states 55 | 56 | 57 | class MitClassificationModel(MegatronModule): 58 | """Mix vision Transformer Model.""" 59 | 60 | def __init__(self, num_classes, 61 | pre_process=True, post_process=True): 62 | super(MitClassificationModel, self).__init__() 63 | args = get_args() 64 | 65 | self.hidden_size = args.hidden_size 66 | self.num_classes = num_classes 67 | 68 | self.backbone = mit_b3_avg() 69 | self.head = torch.nn.Linear(512, num_classes) 70 | self.apply(self._init_weights) 71 | 72 | def _init_weights(self, m): 73 | if isinstance(m, torch.nn.Linear): 74 | trunc_normal_(m.weight, std=.02) 75 | if isinstance(m, torch.nn.Linear) and m.bias is not None: 76 | torch.nn.init.constant_(m.bias, 0) 77 | 78 | def set_input_tensor(self, input_tensor): 79 | """See megatron.model.transformer.set_input_tensor()""" 80 | pass 81 | 82 | def forward(self, input): 83 | hidden_states = self.backbone(input) 84 | hidden_states = self.head(hidden_states) 85 | 86 | return hidden_states 87 | -------------------------------------------------------------------------------- /src/megatron/model/vision/inpainting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # This source code is licensed under the BSD license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | i 6 | import math 7 | import apex 8 | import einops 9 | import torch 10 | import torch.nn.functional as F 11 | from megatron import get_args, print_rank_0 12 | from megatron.model.utils import get_linear_layer 13 | from megatron.model.vision.vit_backbone import VitBackbone 14 | from megatron.model.module import MegatronModule 15 | from megatron.model.vision.mit_backbone import mit_b3 16 | from megatron.model.vision.utils import resize_ 17 | 18 | 19 | class VitInpaintingModel(MegatronModule): 20 | 21 | def __init__(self, config, pre_process=True, post_process=True): 22 | super(VitInpaintingModel, self).__init__() 23 | args = get_args() 24 | 25 | self.pre_process = pre_process 26 | self.post_process = post_process 27 | self.hidden_size = config.hidden_size 28 | self.backbone = VitBackbone( 29 | config=config, 30 | pre_process=self.pre_process, 31 | post_process=self.post_process, 32 | class_token=False, 33 | ) 34 | self.patch_dim = args.patch_dim 35 | self.img_h = args.img_h 36 | self.img_w = args.img_w 37 | self.seq_length = args.seq_length 38 | # full mask 39 | 40 | if self.post_process: 41 | self.linear_decoder = get_linear_layer( 42 | self.hidden_size, 43 | self.backbone.flatten_dim, 44 | torch.nn.init.zeros_, 45 | gather_params_on_init=args.zero_stage == 3 46 | ) 47 | 48 | def set_input_tensor(self, input_tensor): 49 | self.backbone.set_input_tensor(input_tensor) 50 | 51 | def forward(self, input): 52 | 53 | hidden_states = self.backbone(input) 54 | 55 | if not self.post_process: 56 | return hidden_states 57 | decoded_output = self.linear_decoder(hidden_states) 58 | output = einops.rearrange( 59 | decoded_output, 60 | "b (h w) (p1 p2 c) -> b c (h p1) (w p2)", 61 | p1=self.patch_dim, 62 | p2=self.patch_dim, 63 | h=self.img_h//self.patch_dim, 64 | w=self.img_w//self.patch_dim, 65 | ) 66 | 67 | return output 68 | 69 | 70 | class MLP(torch.nn.Module): 71 | """ 72 | Linear Embedding 73 | """ 74 | def __init__(self, input_dim=2048, embed_dim=768): 75 | super().__init__() 76 | self.proj = torch.nn.Linear(input_dim, embed_dim) 77 | 78 | def forward(self, x): 79 | x = x.flatten(2).transpose(1, 2) 80 | x = self.proj(x) 81 | return x 82 | 83 | 84 | class MitInpaintingModel(MegatronModule): 85 | """Mix vision Transformer Model.""" 86 | 87 | def __init__(self, pre_process=True, post_process=True): 88 | super(MitInpaintingModel, self).__init__() 89 | self.pre_process = pre_process 90 | self.post_process = post_process 91 | 92 | args = get_args() 93 | self.patch_dim = args.patch_dim 94 | self.img_h = args.img_h 95 | self.img_w = args.img_w 96 | self.flatten_dim = self.patch_dim * self.patch_dim * 3 97 | self.backbone = mit_b3() 98 | 99 | self.in_channels = [64, 128, 320, 512] 100 | self.embedding_dim = 768 101 | 102 | c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels 103 | 104 | self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=self.embedding_dim) 105 | self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=self.embedding_dim) 106 | self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=self.embedding_dim) 107 | self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=self.embedding_dim) 108 | 109 | self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False) 110 | self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim) 111 | self.dropout = torch.nn.Dropout2d(0.1) 112 | 113 | self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1) 114 | 115 | def set_input_tensor(self, input_tensor): 116 
| """See megatron.model.transformer.set_input_tensor()""" 117 | pass 118 | 119 | def forward(self, input): 120 | c1, c2, c3, c4 = self.backbone(input) 121 | 122 | n, _, h, w = c4.shape 123 | _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3]) 124 | _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False) 125 | 126 | _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3]) 127 | _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False) 128 | 129 | _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3]) 130 | _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False) 131 | 132 | _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3]) 133 | 134 | _c = torch.cat([_c4, _c3, _c2, _c1], dim=1) 135 | _c = self.conv_fuse(_c) 136 | 137 | x = self.norm(_c) 138 | x = F.relu(x, inplace=True) 139 | x = self.dropout(x) 140 | 141 | x = self.linear_pred(x) 142 | 143 | output = einops.rearrange( 144 | x, 145 | "b (c p1 p2) h w -> b c (h p1) (w p2)", 146 | p1=self.patch_dim, 147 | p2=self.patch_dim, 148 | h=self.img_h//self.patch_dim, 149 | w=self.img_w//self.patch_dim, 150 | ) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /src/megatron/model/vision/knn_monitor.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | import torch 3 | from megatron import print_rank_0, get_args 4 | from megatron.core import mpu 5 | from megatron.data.vit_dataset import ClassificationTransform 6 | from megatron.data.image_folder import ImageFolder 7 | 8 | _FEATURE_BANK = None 9 | 10 | 11 | def build_data_loader(dataset, drop_last=True, shuffle=False): 12 | """Data loader. Note that batch-size is the local (per GPU) batch-size.""" 13 | # Sampler. 14 | args = get_args() 15 | micro_batch_size = 16 16 | num_workers = args.num_workers 17 | world_size = mpu.get_data_parallel_world_size() 18 | rank = mpu.get_data_parallel_rank() 19 | sampler = torch.utils.data.distributed.DistributedSampler( 20 | dataset, num_replicas=world_size, rank=rank, 21 | drop_last=drop_last, shuffle=shuffle 22 | ) 23 | 24 | # Data loader. Note that batch size is the per GPU batch size. 
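    # (Editor's note) micro_batch_size is hard-coded to 16 above instead of
    # being read from args, so each data-parallel rank loads 16 images per
    # step regardless of the configured training batch size.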
25 | data_loader = torch.utils.data.DataLoader( 26 | dataset, 27 | batch_size=micro_batch_size, 28 | sampler=sampler, 29 | shuffle=False, 30 | num_workers=num_workers, 31 | drop_last=not drop_last, 32 | pin_memory=True, 33 | ) 34 | return data_loader 35 | 36 | 37 | def compute_feature_bank(model): 38 | args = get_args() 39 | global _FEATURE_BANK 40 | feature_bank = [] 41 | feature_label = [] 42 | 43 | train_ds = ImageFolder( 44 | root=args.data_path[0], 45 | transform=ClassificationTransform((args.img_h, args.img_w), train=False), 46 | data_per_class_fraction=1.0 47 | ) 48 | classes = len(train_ds.classes) 49 | dataloader = build_data_loader(train_ds) 50 | 51 | for m in model: 52 | m.eval() 53 | 54 | with torch.no_grad(): 55 | for i, batch in enumerate(dataloader): 56 | images = batch[0].cuda().contiguous() 57 | labels = batch[1].cuda().contiguous() 58 | student_feature, teacher_feature = model[0](images) 59 | feature = F.normalize(teacher_feature.float(), dim=1) 60 | feature_bank.append(feature) 61 | feature_label.append(labels) 62 | 63 | for m in model: 64 | m.train() 65 | 66 | # [N', D] 67 | feature_bank = torch.cat(feature_bank, dim=0).contiguous() 68 | feature_label = torch.cat(feature_label, dim=0).contiguous() 69 | 70 | feature_banks = [torch.zeros_like(feature_bank) 71 | for i in range(mpu.get_data_parallel_world_size())] 72 | torch.distributed.all_gather(feature_banks, 73 | feature_bank, 74 | group=mpu.get_data_parallel_group()) 75 | 76 | assert torch.all(torch.eq(feature_banks[mpu.get_data_parallel_rank()], 77 | feature_bank)) 78 | 79 | feature_labels = [torch.zeros_like(feature_label) 80 | for i in range(mpu.get_data_parallel_world_size())] 81 | torch.distributed.all_gather(feature_labels, 82 | feature_label, 83 | group=mpu.get_data_parallel_group()) 84 | 85 | # [D, N] 86 | feature_banks = torch.cat(feature_banks, dim=0).t().contiguous() 87 | # [N] 88 | feature_labels = torch.cat(feature_labels, dim=0).contiguous() 89 | print_rank_0("feature_banks size is {}".format(feature_banks.size())) 90 | print_rank_0("feature labels size is {}".format(feature_labels.size())) 91 | 92 | _FEATURE_BANK = (feature_banks, feature_labels, classes) 93 | 94 | 95 | def get_feature_bank(): 96 | global _FEATURE_BANK 97 | assert _FEATURE_BANK is not None 98 | return _FEATURE_BANK 99 | 100 | 101 | # knn monitor as in InstDisc https://arxiv.org/abs/1805.01978 102 | # implementation follows http://github.com/zhirongw/lemniscate.pytorch and 103 | # https://github.com/leftthomas/SimCLR 104 | def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t): 105 | # compute cos similarity between each feature vector and feature bank ---> [B, N] 106 | sim_matrix = torch.mm(feature, feature_bank) 107 | # [B, K] 108 | sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1) 109 | # [B, K] 110 | sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1), 111 | dim=-1, 112 | index=sim_indices) 113 | sim_weight = (sim_weight / knn_t).exp() 114 | 115 | # counts for each class 116 | one_hot_label = torch.zeros(feature.size(0) * knn_k, 117 | classes, 118 | device=sim_labels.device) 119 | # [B*K, C] 120 | one_hot_label = one_hot_label.scatter(dim=-1, 121 | index=sim_labels.view(-1, 1), 122 | value=1.0) 123 | # weighted score ---> [B, C] 124 | pred_scores = torch.sum( 125 | one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1), 126 | dim=1) 127 | 128 | pred_labels = pred_scores.argsort(dim=-1, descending=True) 129 | return pred_labels 130 | 
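# ---------------------------------------------------------------------------
# Editor's addition: a hedged, illustrative sketch of calling knn_predict
# (not part of the original file). Shapes and hyper-parameters below are
# arbitrary; in training the bank comes from compute_feature_bank/get_feature_bank.
if __name__ == "__main__":
    B, D, N, classes = 4, 128, 1000, 10
    # L2-normalised query features [B, D] and bank [D, N], random for the demo.
    feature = F.normalize(torch.randn(B, D), dim=1)
    feature_bank = F.normalize(torch.randn(N, D), dim=1).t().contiguous()
    feature_labels = torch.randint(0, classes, (N,))

    pred = knn_predict(feature, feature_bank, feature_labels,
                       classes, knn_k=20, knn_t=0.07)
    print(pred[:, 0])        # highest-scoring class index for each query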
-------------------------------------------------------------------------------- /src/megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /src/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /src/megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | from deepspeed.accelerator import get_accelerator 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % get_accelerator().device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | get_accelerator().set_device(device) 50 | 51 | # Call the init process. 
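    # (Editor's note) Illustrative example: with MASTER_ADDR=10.0.0.1 and
    # MASTER_PORT=6000, the init_method assembled below is 'tcp://10.0.0.1:6000'.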
52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /src/megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import set_random_seed 4 | from commons import IdentityLayer 5 | from commons import print_separator 6 | from commons import initialize_distributed 7 | from mpu.cross_entropy import vocab_parallel_cross_entropy 8 | import mpu 9 | import torch.nn.functional as F 10 | import torch 11 | import random 12 | import sys 13 | from deepspeed.accelerator import get_accelerator 14 | sys.path.append("../..") 15 | 16 | 17 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 18 | logits_scale, seed): 19 | set_random_seed(seed) 20 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 21 | scale=logits_scale).to(get_accelerator().device_name()) 22 | logits = identity() 23 | target = get_accelerator().LongTensor( 24 | size=(batch_size, seq_length)).random_(0, vocab_size) 25 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 26 | target.view(-1), 27 | reduction='none').view_as(target).mean() 28 | loss.backward() 29 | return loss, identity.weight.grad 30 | 31 | 32 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 33 | logits_scale, seed): 34 | set_random_seed(seed) 35 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 36 | scale=logits_scale).to(get_accelerator().device_name()) 37 | logits = identity() 38 | logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits) 39 | target = get_accelerator().LongTensor( 40 | size=(batch_size, seq_length)).random_(0, vocab_size) 41 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 42 | loss.backward() 43 | return loss, identity.weight.grad 44 | 45 | 46 | def test_cross_entropy(tensor_model_parallel_size): 47 | 48 | if torch.distributed.get_rank() == 0: 49 | print('> testing cross entropy with model parallel size {} ...'. 
50 | format(tensor_model_parallel_size)) 51 | 52 | mpu.initialize_model_parallel(tensor_model_parallel_size) 53 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 54 | 55 | batch_size = 13 56 | seq_length = 17 57 | vocab_size_per_partition = 11 58 | logits_scale = 1000.0 59 | vocab_size = vocab_size_per_partition * tensor_model_parallel_size 60 | seed = 1234 61 | 62 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 63 | vocab_size, logits_scale, 64 | seed) 65 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 66 | vocab_size, logits_scale, 67 | seed) 68 | 69 | error = loss_torch.sub_(loss_mpu).abs().max() 70 | print(' max error in loss on global rank {}: {}'.format( 71 | torch.distributed.get_rank(), error)) 72 | assert error < 1.0e-6 73 | 74 | error = grad_torch.sub_(grad_mpu).abs().max() 75 | print(' max error in grad on global rank {}: {}'.format( 76 | torch.distributed.get_rank(), error)) 77 | assert error < 1.0e-6 78 | 79 | # Reset groups 80 | mpu.destroy_tensor_model_parallel() 81 | 82 | torch.distributed.barrier() 83 | if torch.distributed.get_rank() == 0: 84 | print('>> passed the test :-)') 85 | 86 | 87 | if __name__ == '__main__': 88 | 89 | initialize_distributed() 90 | world_size = torch.distributed.get_world_size() 91 | 92 | tensor_model_parallel_size = 1 93 | while tensor_model_parallel_size <= world_size: 94 | print_separator('test cross entropy') 95 | test_cross_entropy(tensor_model_parallel_size) 96 | tensor_model_parallel_size *= 2 97 | -------------------------------------------------------------------------------- /src/megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from deepspeed.accelerator import get_accelerator 6 | from mpu import data as data_utils 7 | import mpu 8 | import torch 9 | import functools 10 | import operator 11 | import sys 12 | sys.path.append("../..") 13 | 14 | 15 | def test_broadcast_data(tensor_model_parallel_size): 16 | 17 | if torch.distributed.get_rank() == 0: 18 | print('> testing broadcast_data with model parallel size {} ...'. 
19 | format(tensor_model_parallel_size)) 20 | 21 | mpu.initialize_model_parallel(tensor_model_parallel_size) 22 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 23 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 24 | 25 | key_size_t = {'key1': [7, 11], 26 | 'key2': [8, 2, 1], 27 | 'key3': [13], 28 | 'key4': [5, 1, 2], 29 | 'key5': [5, 12]} 30 | keys = list(key_size_t.keys()) 31 | 32 | data = {} 33 | data_t = {} 34 | for key in key_size_t: 35 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 36 | data_t[key] = data[key].clone() 37 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 38 | data_t['keyX'] = data['keyX'].clone() 39 | if mpu.get_tensor_model_parallel_rank() != 0: 40 | data = None 41 | 42 | data_utils._check_data_types(keys, data_t, torch.int64) 43 | key_size, key_numel, \ 44 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 45 | for key in keys: 46 | assert key_size[key] == key_size_t[key] 47 | total_numel_t = 0 48 | for key in keys: 49 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 50 | assert key_numel[key] == target_size 51 | total_numel_t += target_size 52 | assert total_numel == total_numel_t 53 | 54 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 55 | for key in keys: 56 | tensor = data_t[key].to(get_accelerator().device_name()) 57 | assert data_b[key].sub(tensor).abs().max() == 0 58 | 59 | # Reset groups 60 | mpu.destroy_tensor_model_parallel() 61 | 62 | torch.distributed.barrier() 63 | if torch.distributed.get_rank() == 0: 64 | print('>> passed the test :-)') 65 | 66 | 67 | if __name__ == '__main__': 68 | 69 | initialize_distributed() 70 | world_size = torch.distributed.get_world_size() 71 | 72 | tensor_model_parallel_size = 1 73 | while tensor_model_parallel_size <= world_size: 74 | print_separator('test test broadcast data') 75 | test_broadcast_data(tensor_model_parallel_size) 76 | tensor_model_parallel_size *= 2 77 | -------------------------------------------------------------------------------- /src/megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | import mpu 6 | import torch 7 | import sys 8 | sys.path.append("../..") 9 | 10 | 11 | def test_initialize_model_parallel(tensor_model_parallel_size): 12 | 13 | if torch.distributed.get_rank() == 0: 14 | print('> testing initialize_model_parallel with size {} ...'.format( 15 | tensor_model_parallel_size)) 16 | tensor_model_parallel_size_ = min(tensor_model_parallel_size, 17 | torch.distributed.get_world_size()) 18 | assert not mpu.model_parallel_is_initialized() 19 | mpu.initialize_model_parallel(tensor_model_parallel_size_) 20 | assert mpu.model_parallel_is_initialized() 21 | 22 | # Checks. 23 | def check(group, world_size, rank): 24 | assert world_size == torch.distributed.get_world_size(group=group) 25 | assert rank == torch.distributed.get_rank(group=group) 26 | 27 | # Model parallel. 28 | world_size = tensor_model_parallel_size_ 29 | rank = torch.distributed.get_rank() % tensor_model_parallel_size_ 30 | assert world_size == mpu.get_tensor_model_parallel_world_size() 31 | assert rank == mpu.get_tensor_model_parallel_rank() 32 | check(mpu.get_tensor_model_parallel_group(), world_size, rank) 33 | 34 | # Data parallel. 
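    # (Editor's note) Illustrative example: with 8 GPUs and a tensor-model-
    # parallel size of 2, the tensor-parallel groups are {0,1},{2,3},{4,5},
    # {6,7}; the data-parallel world size computed below is 8 // 2 = 4 and
    # global rank r maps to data-parallel rank r // 2.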
35 | world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ 36 | rank = torch.distributed.get_rank() // tensor_model_parallel_size 37 | assert world_size == mpu.get_data_parallel_world_size() 38 | assert rank == mpu.get_data_parallel_rank() 39 | check(mpu.get_data_parallel_group(), world_size, rank) 40 | 41 | # Reset groups 42 | mpu.destroy_model_parallel() 43 | 44 | torch.distributed.barrier() 45 | if torch.distributed.get_rank() == 0: 46 | print('>> passed the test :-)') 47 | 48 | 49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): 50 | 51 | if torch.distributed.get_rank() == 0: 52 | print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format( 53 | tensor_model_parallel_size_)) 54 | tensor_model_parallel_size = min(tensor_model_parallel_size_, 55 | torch.distributed.get_world_size()) 56 | assert not mpu.model_parallel_is_initialized() 57 | mpu.initialize_model_parallel(tensor_model_parallel_size) 58 | assert mpu.model_parallel_is_initialized() 59 | 60 | # Checks 61 | src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank() 62 | assert mpu.get_tensor_model_parallel_src_rank() == src_rank 63 | 64 | # Reset groups 65 | mpu.destroy_model_parallel() 66 | 67 | torch.distributed.barrier() 68 | if torch.distributed.get_rank() == 0: 69 | print('>> passed the test :-)') 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | initialize_distributed() 75 | world_size = torch.distributed.get_world_size() 76 | tensor_model_parallel_size = 1 77 | while tensor_model_parallel_size <= world_size: 78 | print_separator('test initialize model parallel') 79 | test_initialize_model_parallel(tensor_model_parallel_size) 80 | print_separator('test model parallel source rank') 81 | test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) 82 | tensor_model_parallel_size *= 2 83 | -------------------------------------------------------------------------------- /src/megatron/optimizer/clip_grads.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Gradient clipping.""" 4 | 5 | import torch 6 | try: 7 | from torch._six import inf as inf 8 | except ModuleNotFoundError: 9 | from torch import inf as inf 10 | 11 | from deepspeed.accelerator import get_accelerator 12 | if get_accelerator().device_name() == 'cuda': 13 | from apex.multi_tensor_apply import multi_tensor_applier 14 | import amp_C 15 | 16 | from megatron.model.module import param_is_not_shared 17 | from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate 18 | 19 | 20 | def clip_grad_norm_fp32(parameters, grads_for_norm, 21 | max_norm, norm_type=2, 22 | model_parallel_group=None): 23 | """Clips gradient norm of an iterable of parameters whose gradients 24 | are in fp32. 25 | 26 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 27 | added functionality to handle model parallel parameters. Note that 28 | the gradients are modified in place. 29 | 30 | Arguments: 31 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 32 | single Tensor that will have gradients normalized 33 | grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single 34 | Tensor that will be used for calculating the grad norm. 35 | max_norm (float or int): max norm of the gradients 36 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 37 | infinity norm. 
38 | model_parallel_group (group): given the nature of the distributed 39 | optimizer, this is passed as an argument. 40 | 41 | Returns: 42 | Total norm of the parameters (viewed as a single vector). 43 | """ 44 | 45 | if isinstance(parameters, torch.Tensor): 46 | parameters = [parameters] 47 | if isinstance(grads_for_norm, torch.Tensor): 48 | grads_for_norm = [grads_for_norm] 49 | 50 | # Grads. 51 | grads = [] 52 | for param in parameters: 53 | if param.grad is not None: 54 | assert param.grad.type() == 'torch.{}.FloatTensor'.format(get_accelerator().device_name()) 55 | grads.append(param.grad.detach()) 56 | 57 | # Norm parameters. 58 | max_norm = float(max_norm) 59 | norm_type = float(norm_type) 60 | total_norm = 0.0 61 | 62 | # Calculate norm. 63 | if norm_type == inf: 64 | total_norm = max(grad.abs().max() for grad in grads_for_norm) 65 | total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) 66 | # Take max across all model-parallel GPUs. 67 | torch.distributed.all_reduce(total_norm_cuda, 68 | op=torch.distributed.ReduceOp.MAX, 69 | group=model_parallel_group) 70 | total_norm = total_norm_cuda[0].item() 71 | 72 | else: 73 | if norm_type == 2.0: 74 | if get_accelerator().device_name() == 'cuda': 75 | dummy_overflow_buf = torch.cuda.IntTensor([0]) 76 | # Use apex's multi-tensor applier for efficiency reasons. 77 | # Multi-tensor applier takes a function and a list of list 78 | # and performs the operation on that list all in one kernel. 79 | if grads_for_norm: 80 | grad_norm, _ = multi_tensor_applier( 81 | amp_C.multi_tensor_l2norm, 82 | dummy_overflow_buf, 83 | [grads_for_norm], 84 | False # no per-parameter norm 85 | ) 86 | else: 87 | grad_norm = torch.cuda.FloatTensor([0]) 88 | else: 89 | grad_norm = torch.norm(grads_for_norm,p=2.0) 90 | # Since we will be summing across data parallel groups, 91 | # we need the pow(norm-type). 92 | total_norm = grad_norm ** norm_type 93 | else: 94 | for grad in grads_for_norm: 95 | grad_norm = torch.norm(grad, norm_type) 96 | total_norm += grad_norm ** norm_type 97 | 98 | # Sum across all model-parallel GPUs. 99 | torch.distributed.all_reduce(total_norm, 100 | op=torch.distributed.ReduceOp.SUM, 101 | group=model_parallel_group) 102 | total_norm = total_norm.item() ** (1.0 / norm_type) 103 | 104 | # Scale. 
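    # (Editor's note) Illustrative example: with max_norm=1.0 and a computed
    # total_norm of 4.0, clip_coeff below is ~0.25 and every gradient is
    # scaled down by ~4x; when total_norm is below max_norm, clip_coeff >= 1
    # and the gradients are left unchanged.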
105 | clip_coeff = max_norm / (total_norm + 1.0e-6) 106 | if clip_coeff < 1.0: 107 | if get_accelerator().device_name() == 'cuda': 108 | dummy_overflow_buf = get_accelerator().IntTensor([0]) 109 | multi_tensor_applier(amp_C.multi_tensor_scale, 110 | dummy_overflow_buf, 111 | [grads, grads], 112 | clip_coeff) 113 | else: 114 | for g in grads: 115 | g.detach().mul_(clip_coeff.to(g.device)) 116 | 117 | return total_norm 118 | 119 | 120 | def count_zeros_fp32(parameters, model_parallel_group): 121 | 122 | if isinstance(parameters, torch.Tensor): 123 | parameters = [parameters] 124 | 125 | # Filter parameters based on: 126 | # - grad should not be none 127 | # - parameter should not be shared 128 | # - should not be a replica due to tensor model parallelism 129 | total_num_zeros = torch.cuda.FloatTensor([0.0]) 130 | for param in parameters: 131 | grad_not_none = param.grad is not None 132 | is_not_shared = param_is_not_shared(param) 133 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) 134 | if grad_not_none and is_not_shared and is_not_tp_duplicate: 135 | grad = param.grad.detach() 136 | num_zeros = grad.numel() - torch.count_nonzero(grad) 137 | total_num_zeros = num_zeros + total_num_zeros 138 | 139 | # Sum across all model-parallel GPUs. 140 | torch.distributed.all_reduce(total_num_zeros, 141 | op=torch.distributed.ReduceOp.SUM, 142 | group=model_parallel_group) 143 | 144 | total_num_zeros = total_num_zeros.item() 145 | 146 | return total_num_zeros 147 | -------------------------------------------------------------------------------- /src/megatron/optimizer/grad_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Megatron grad scaler.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | import torch 9 | from deepspeed.accelerator import get_accelerator 10 | 11 | class MegatronGradScaler(ABC): 12 | 13 | def __init__(self, initial_scale): 14 | """Initialize scale value with the input initial scale.""" 15 | assert initial_scale > 0.0 16 | self._scale = get_accelerator().FloatTensor([initial_scale]) 17 | 18 | @property 19 | def scale(self): 20 | return self._scale 21 | 22 | @property 23 | def inv_scale(self): 24 | return self._scale.double().reciprocal().float() 25 | 26 | @abstractmethod 27 | def update(self, found_inf): 28 | pass 29 | 30 | @abstractmethod 31 | def state_dict(self): 32 | pass 33 | 34 | @abstractmethod 35 | def load_state_dict(self, state_dict): 36 | pass 37 | 38 | 39 | 40 | class ConstantGradScaler(MegatronGradScaler): 41 | 42 | def update(self, found_inf): 43 | pass 44 | 45 | def state_dict(self): 46 | return dict() 47 | 48 | def load_state_dict(self, state_dict): 49 | pass 50 | 51 | 52 | 53 | class DynamicGradScaler(MegatronGradScaler): 54 | 55 | def __init__(self, initial_scale, min_scale, 56 | growth_factor, backoff_factor, 57 | growth_interval, hysteresis): 58 | """"Grad scaler with dynamic scale that gets adjusted 59 | during training.""" 60 | super(DynamicGradScaler, self).__init__(initial_scale) 61 | 62 | # Lower bound on the scale. 63 | assert min_scale > 0.0 64 | assert min_scale <= initial_scale 65 | self.min_scale = get_accelerator().FloatTensor([min_scale]) 66 | # Growth and backoff factors for the scale. 
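        # (Editor's note) Megatron typically constructs this scaler with
        # growth_factor=2.0, backoff_factor=0.5, growth_interval=1000 and
        # hysteresis=2, so the scale doubles after 1000 overflow-free steps
        # and halves once two overflows have been seen.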
67 | assert growth_factor > 1.0 68 | self.growth_factor = get_accelerator().FloatTensor([growth_factor]) 69 | assert backoff_factor < 1.0 70 | assert backoff_factor > 0.0 71 | self.backoff_factor = get_accelerator().FloatTensor([backoff_factor]) 72 | # Interval over which if we don't see any inf/nan, 73 | # we will scale the grad scale by the growth factor. 74 | assert growth_interval > 0 75 | self.growth_interval = growth_interval 76 | # Number of inf/nans we should see before scaling down 77 | # the grad scale by the backoff factor. 78 | assert hysteresis > 0 79 | self.hysteresis = hysteresis 80 | 81 | # Trackers. 82 | self._growth_tracker = 0 83 | self._hysteresis_tracker = self.hysteresis 84 | 85 | 86 | def update(self, found_inf): 87 | 88 | # If we have an inf/nan, growth tracker is set to 0 89 | # and hysterisis tracker is reduced by 1. 90 | if found_inf: 91 | self._growth_tracker = 0 92 | self._hysteresis_tracker -= 1 93 | # Now if we are out of hysteresis count, scale down the loss. 94 | if self._hysteresis_tracker <= 0: 95 | self._scale = torch.max(self._scale * self.backoff_factor, 96 | self.min_scale) 97 | else: 98 | # If there is no nan/inf, increment the growth tracker. 99 | self._growth_tracker += 1 100 | # If we have had enough consequitive intervals with no nan/inf: 101 | if self._growth_tracker == self.growth_interval: 102 | # Reset the tracker and hysteresis trackers, 103 | self._growth_tracker = 0 104 | self._hysteresis_tracker = self.hysteresis 105 | # and scale up the loss scale. 106 | self._scale = self._scale * self.growth_factor 107 | 108 | 109 | def state_dict(self): 110 | state_dict = {} 111 | state_dict['scale'] = self._scale 112 | state_dict['growth_tracker'] = self._growth_tracker 113 | state_dict['hysteresis_tracker'] = self._hysteresis_tracker 114 | return state_dict 115 | 116 | 117 | def load_state_dict(self, state_dict): 118 | self._scale = state_dict['scale'].to(get_accelerator().current_device_name()) 119 | self._growth_tracker = state_dict['growth_tracker'] 120 | self._hysteresis_tracker = state_dict['hysteresis_tracker'] 121 | -------------------------------------------------------------------------------- /src/megatron/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Megatron 9 | 71 | 72 | 73 |
74 | ... 85 | [extraction note: the HTML markup of index.html lines 74-85 was stripped during extraction; the page renders a "Prompt Megatron" heading and a "0 / 1000" character counter]
86 | 87 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /src/megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /src/megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /src/megatron/text_generation/sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Sampling utilities. 
4 | Part of this code is inspired by: 5 | - https://github.com/ari-holtzman/degen/blob/master/gen.py 6 | - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html 7 | """ 8 | 9 | 10 | import torch 11 | 12 | 13 | 14 | def modify_logits_for_top_k_filtering(logits, top_k): 15 | """Set the logits for none top-k values to -inf.""" 16 | 17 | filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] 18 | logits.masked_fill_(filter_, float('-Inf')) 19 | 20 | 21 | 22 | def modify_logits_for_top_p_filtering(logits, top_p): 23 | """Set the logits for none top-p values to -inf.""" 24 | 25 | # First sort and calculate cumulative sum of probabilities. 26 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 27 | cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) 28 | 29 | # Filteration based on the cumulative sum. 30 | filter_ = cumulative_probs > top_p 31 | # This shift by 1 is weird and I cannot justify it. This existed 32 | # in the original implementation: 33 | # https://github.com/ari-holtzman/degen/blob/master/gen.py 34 | # and I guess it is needed so keeping it for now. 35 | filter_[:, 1:] = filter_[:, :-1].clone() 36 | # Make sure we at least have one token to select from. 37 | filter_[..., 0] = 0 38 | 39 | # Fill in the filtered part 40 | filter_ = filter_.scatter(1, sorted_indices, filter_) 41 | logits.masked_fill_(filter_, float('-Inf')) 42 | 43 | 44 | 45 | def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): 46 | """ Sample and generate a token. 47 | Note: logits has the dimension [b, v] where b is the batch size 48 | and v is the vocabulary size. 49 | If vocab_size is provided, we will make sure the sample that is 50 | generated is in [0, vocab-size). This will avoid out of vocabulary 51 | generations due to padding. 52 | """ 53 | 54 | # Check logits for consistency. 55 | assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' 56 | assert logits.type() == 'torch.cuda.FloatTensor', \ 57 | 'input logits should be floats.' 58 | 59 | 60 | # Greedy is just simple argmax. 61 | if top_k == 1: 62 | assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' 63 | samples = torch.argmax(logits, dim=-1) 64 | 65 | # Top-k or top-p sampling. 66 | else: 67 | # Clone so we do not modify the inputs, 68 | logits = logits.clone() 69 | # Apply temperature in place. 70 | if temperature != 1.0: 71 | logits.div_(temperature) 72 | 73 | if top_k > 1: 74 | assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' 75 | assert top_k <= logits.size(1), 'top-k is larger than logit size.' 76 | if vocab_size: 77 | assert top_k < vocab_size, 'top-k is larger than vocab size.' 78 | modify_logits_for_top_k_filtering(logits, top_k) 79 | 80 | elif top_p > 0.0: 81 | assert top_p <= 1.0, 'top-p should be in (0, 1].' 82 | modify_logits_for_top_p_filtering(logits, top_p) 83 | 84 | # After filtering, we need to recalculate the distribution. 85 | probs = logits.softmax(dim=-1) 86 | samples = torch.multinomial(probs, num_samples=1).view(-1) 87 | 88 | # If vocab size is provided, make sure the samples are in 89 | # in the range [0, vocab-size). 90 | if vocab_size: 91 | samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) 92 | 93 | return samples 94 | -------------------------------------------------------------------------------- /src/megatron/text_generation/tokenization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. 2 | 3 | """Tokenization utilities.""" 4 | 5 | 6 | import torch 7 | 8 | 9 | from megatron import get_tokenizer, get_args 10 | from .communication import broadcast_int_list, broadcast_tensor 11 | 12 | 13 | def detokenize_generations(tokens_gpu_tensor, 14 | lengths_gpu_tensor, 15 | return_segments): 16 | """Detokenize the generated tokens.""" 17 | 18 | tokenizer = get_tokenizer() 19 | args = get_args() 20 | prompts_plus_generations = [] 21 | if return_segments: 22 | prompts_plus_generations_segments = [] 23 | 24 | tokens = tokens_gpu_tensor.cpu().numpy().tolist() 25 | lengths = lengths_gpu_tensor.cpu().numpy().tolist() 26 | for sequence_tokens, length in zip(tokens, lengths): 27 | sequence_tokens = sequence_tokens[:length] 28 | prompts_plus_generations.append( 29 | tokenizer.detokenize(sequence_tokens)) 30 | if return_segments: 31 | words = [] 32 | for token in sequence_tokens: 33 | if args.tokenizer_type in ['SentencePieceTokenizer', 34 | 'GPTSentencePieceTokenizer']: 35 | word = tokenizer.decoder[token] 36 | elif args.tokenizer_type == 'NullTokenizer': 37 | word = str(token) 38 | else: 39 | word = tokenizer.tokenizer.decoder[token] 40 | word = bytearray( 41 | [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( 42 | 'utf-8', errors='replace') 43 | words.append(word) 44 | prompts_plus_generations_segments.append(words) 45 | 46 | if return_segments: 47 | return tokens, prompts_plus_generations, \ 48 | prompts_plus_generations_segments 49 | 50 | return tokens, prompts_plus_generations 51 | 52 | 53 | def tokenize_prompts(prompts=None, tokens_to_generate=None, 54 | add_BOS=None, rank=0): 55 | """Tokenize prompts and make them avaiable on all ranks.""" 56 | 57 | # On all ranks set to None so we can pass them to functions 58 | sizes_list = None 59 | prompts_tokens_cuda_long_tensor = None 60 | prompts_length_cuda_long_tensor = None 61 | 62 | # On the specified rank, build the above. 63 | if torch.distributed.get_rank() == rank: 64 | assert prompts is not None 65 | assert tokens_to_generate is not None 66 | # Tensor of tokens padded and their unpadded length. 67 | prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \ 68 | _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS) 69 | # We need the sizes of these tensors for the boradcast 70 | sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size 71 | prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght 72 | 73 | # First, broadcast the sizes. 74 | sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank) 75 | 76 | # Now that we have the sizes, we can boradcast the tokens 77 | # and length tensors. 78 | sizes = sizes_tensor.tolist() 79 | prompts_tokens_cuda_long_tensor = broadcast_tensor( 80 | sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank) 81 | prompts_length_cuda_long_tensor = broadcast_tensor( 82 | sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor, 83 | rank=rank) 84 | 85 | return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor 86 | 87 | 88 | def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): 89 | """Given a set of prompts and number of tokens to generate: 90 | - tokenize prompts 91 | - set the sequence length to be the max of length of prompts 92 | plus the number of tokens we would like to generate 93 | - pad all the sequences to this length so we can convert them 94 | into a 2D tensor. 95 | """ 96 | 97 | # Tokenize all the prompts. 
98 | tokenizer = get_tokenizer() 99 | if add_BOS: 100 | prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) 101 | for prompt in prompts] 102 | else: 103 | prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] 104 | 105 | # Now we have a list of list of tokens which each list has a different 106 | # size. We want to extend this list to: 107 | # - incorporate the tokens that need to be generated 108 | # - make all the sequences equal length. 109 | # Get the prompts length. 110 | prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens] 111 | # Get the max prompts length. 112 | max_prompt_len = max(prompts_length) 113 | # Number of tokens in the each sample of the batch. 114 | samples_length = max_prompt_len + tokens_to_generate 115 | # Now update the list of list to be of the same size: samples_length. 116 | for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): 117 | padding_size = samples_length - prompt_length 118 | prompt_tokens.extend([tokenizer.eod] * padding_size) 119 | 120 | # Now we are in a structured format, we can convert to tensors. 121 | prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens) 122 | prompts_length_tensor = torch.cuda.LongTensor(prompts_length) 123 | 124 | return prompts_tokens_tensor, prompts_length_tensor 125 | -------------------------------------------------------------------------------- /src/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /src/scripts/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX 3 | CHECKPOINT_PATH=dataset/checkpoints/gpt2_345m 4 | VOCAB_FILE=dataset/gpt2-vocab.json 5 | MERGE_FILE=dataset/gpt2-merges.txt 6 | b=8 7 | mp=1 8 | experts=1 9 | nodes=1 10 | gpus=1 11 | 12 | 13 | use_tutel="" 14 | #use_tutel="--use-tutel" 15 | 16 | 17 | ds_inference="" 18 | #ds_inference="--ds-inference" 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | 22 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" 23 | L=24 24 | H=1024 25 | A=16 26 | #experts1=${experts[$k]} 27 | program_cmd="tools/generate_samples_gpt.py \ 28 | --tensor-model-parallel-size $mp \ 29 | --num-layers $L \ 30 | --hidden-size $H \ 31 | --num-attention-heads $A \ 32 | --max-position-embeddings 1024 \ 33 | --tokenizer-type GPT2BPETokenizer \ 34 | --fp16 \ 35 | --num-experts ${experts} \ 36 | --mlp-type standard \ 37 | --micro-batch-size $b \ 38 | --seq-length 1024 \ 39 | --out-seq-length 1024 \ 40 | --temperature 1.0 \ 41 | --vocab-file $VOCAB_FILE \ 42 | --merge-file $MERGE_FILE \ 43 | --genfile unconditional_samples.json \ 44 | --top_p 0.9 \ 45 | --log-interval 1 \ 46 | --num-samples 0 \ 47 | --load $CHECKPOINT_PATH \ 48 | $use_tutel $ds_inference" 49 | 50 | echo $launch_cmd $program_cmd 51 | $launch_cmd $program_cmd 52 | -------------------------------------------------------------------------------- /src/scripts/gpt/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | 
"gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 | -------------------------------------------------------------------------------- /src/scripts/pretrain_llama_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example script is contributed by external user https://github.com/LydiaXiaohongLi 3 | set -ex 4 | 5 | ###################################### 6 | # Change the below configurations here 7 | BASE_PATH=./tmp 8 | DS_CONFIG=${BASE_PATH}/deepspeed.json 9 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" 10 | DATASET="1 ${DATASET_1}" 11 | CHECKPOINT_PATH=./tmp 12 | TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model 13 | 14 | TP=2 15 | PP=2 16 | ZERO_STAGE=0 17 | 18 | GPUS_PER_NODE=8 19 | MASTER_ADDR=localhost 20 | MASTER_PORT=6000 21 | NNODES=1 22 | NODE_RANK=0 23 | 24 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 25 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 26 | NUM_LAYERS=24 # e.g. llama-13b: 40 27 | NUM_HEADS=16 # e.g. llama-13b: 40 28 | SEQ_LENGTH=2048 29 | 30 | MICRO_BATCH_SIZE=4 31 | GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens 32 | TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps 33 | LR=3e-4 34 | MIN_LR=3e-5 35 | LR_WARMUP_STEPS=2000 36 | WEIGHT_DECAY=0.1 37 | GRAD_CLIP=1 38 | 39 | ## Activation checkpointing saves GPU memory, but reduces training speed 40 | # activation_checkpoint="true" 41 | activation_checkpoint="false" 42 | 43 | # Below configuration required for llama model as per llama paper 44 | # --no-query-key-layer-scaling \ 45 | # --attention-dropout 0 \ 46 | # --hidden-dropout 0 \ 47 | # --use-rotary-position-embeddings \ 48 | # --untie-embeddings-and-output-weights \ 49 | # --swiglu \ 50 | # --normalization rmsnorm \ 51 | # --disable-bias-linear \ 52 | ###################################### 53 | 54 | 55 | 56 | cat < $DS_CONFIG 57 | { 58 | "train_batch_size" : $GLOBAL_BATCH_SIZE, 59 | "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, 60 | "steps_per_print": 1, 61 | "zero_optimization": { 62 | "stage": $ZERO_STAGE 63 | }, 64 | "bf16": { 65 | "enabled": true 66 | } 67 | } 68 | EOT 69 | 70 | ds_args="" 71 | ds_args=" --deepspeed ${ds_args}" 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 74 | 75 | if [ "${activation_checkpoint}" = "true" ]; then 76 | ds_args="--deepspeed-activation-checkpointing ${ds_args}" 77 | 78 | ## old argument for recomputing the transformer layer 79 | # ds_args="--checkpoint-activations ${ds_args}" 80 | 81 | ## new argument for recomputing the transformer layer 82 | ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" 83 | ## new argument for recomputing only the attention layer 84 | # ds_args="--recompute-granularity selective ${ds_args}" 85 | fi 86 | 87 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 88 | 89 | torchrun $DISTRIBUTED_ARGS \ 90 | pretrain.py \ 91 | --tensor-model-parallel-size $TP \ 92 | --pipeline-model-parallel-size $PP \ 93 | --num-layers $NUM_LAYERS \ 94 | --hidden-size $HIDDEN_SIZE \ 95 | --ffn-hidden-size $FFN_HIDDEN_SIZE \ 96 | --num-attention-heads $NUM_HEADS \ 97 | 
--micro-batch-size $MICRO_BATCH_SIZE \ 98 | --global-batch-size $GLOBAL_BATCH_SIZE \ 99 | --seq-length $SEQ_LENGTH \ 100 | --max-position-embeddings $SEQ_LENGTH \ 101 | --train-iters $TRAIN_STEPS \ 102 | --save $CHECKPOINT_PATH \ 103 | --load $CHECKPOINT_PATH \ 104 | --data-path $DATASET \ 105 | --data-impl mmap \ 106 | --tokenizer-type GPTSentencePieceTokenizer \ 107 | --tokenizer-model $TOKENIZER_PATH \ 108 | --split 949,50,1 \ 109 | --distributed-backend nccl \ 110 | --lr $LR \ 111 | --lr-decay-style cosine \ 112 | --min-lr $MIN_LR \ 113 | --weight-decay $WEIGHT_DECAY \ 114 | --clip-grad $GRAD_CLIP \ 115 | --lr-warmup-iters $LR_WARMUP_STEPS \ 116 | --optimizer adam \ 117 | --adam-beta1 0.9 \ 118 | --adam-beta2 0.95 \ 119 | --log-interval 1 \ 120 | --save-interval 10000 \ 121 | --eval-interval 1000 \ 122 | --eval-iters 10 \ 123 | --bf16 \ 124 | --no-query-key-layer-scaling \ 125 | --attention-dropout 0 \ 126 | --hidden-dropout 0 \ 127 | --use-rotary-position-embeddings \ 128 | --untie-embeddings-and-output-weights \ 129 | --swiglu \ 130 | --normalization rmsnorm \ 131 | --disable-bias-linear \ 132 | $ds_args -------------------------------------------------------------------------------- /src/scripts/run_deepspeed_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | BASE_PATH=/vc_data/Megatron-LM/data 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron 6 | DS_CONFIG=ds_config.json 7 | 8 | TP=1 9 | PP=1 10 | NLAYERS=24 11 | HIDDEN=512 12 | 13 | GLOBAL_BATCH=64 14 | MICRO_BATCH=4 15 | 16 | ZERO_STAGE=2 17 | 18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH} 20 | mkdir -p $OUTPUT_DIR 21 | 22 | cat < $DS_CONFIG 23 | { 24 | "train_batch_size" : $GLOBAL_BATCH, 25 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 26 | "steps_per_print": 1, 27 | 28 | "zero_optimization": { 29 | "stage": $ZERO_STAGE 30 | }, 31 | 32 | "fp16": { 33 | "enabled": true, 34 | "initial_scale_power": 12 35 | }, 36 | 37 | "wall_clock_breakdown" : true 38 | } 39 | EOT 40 | 41 | export NCCL_DEBUG=warn 42 | 43 | ds_args="" 44 | ds_args=" --deepspeed ${ds_args}" 45 | ds_args=" --no-pipeline-parallel ${ds_args}" 46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 49 | 50 | 51 | deepspeed ./src/pretrain.py \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size $PP \ 54 | --num-layers $NLAYERS \ 55 | --hidden-size $HIDDEN \ 56 | --num-attention-heads 16 \ 57 | --seq-length 256 \ 58 | --loss-scale 12 \ 59 | --max-position-embeddings 1024 \ 60 | --micro-batch-size 4 \ 61 | --global-batch-size 1024 \ 62 | --train-iters 1000 \ 63 | --lr 6.0e-5 \ 64 | --min-lr 6.0e-6 \ 65 | --lr-decay-style cosine \ 66 | --log-interval 1 \ 67 | --eval-iters 40 \ 68 | --eval-interval 1000 \ 69 | --data-path $DATA_PATH \ 70 | --vocab-file $BASE_PATH/gpt2-vocab.json \ 71 | --merge-file $BASE_PATH/gpt2-merges.txt \ 72 | --save-interval 1000 \ 73 | --split 98,2,0 \ 74 | --clip-grad 1.0 \ 75 | --weight-decay 0.1 \ 76 | --adam-beta1 0.9 \ 77 | --adam-beta2 0.95 \ 78 | --init-method-std 0.006 \ 79 | --fp16 \ 80 | --checkpoint-activations \ 81 | --tensorboard-dir $OUTPUT_DIR \ 82 | $ds_args \ 83 | --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log 84 | 85 | 
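The script above writes its DeepSpeed configuration with a shell heredoc before launching. For sweeps or notebooks it can be handy to emit the same file from Python; the sketch below mirrors the heredoc's keys and the script's default values, while the helper name and the world-size check are ours rather than part of the repo. The check reflects DeepSpeed's requirement that `train_batch_size` equal `train_micro_batch_size_per_gpu × gradient_accumulation_steps × data-parallel world size`.

```python
# Sketch: build the same ds_config.json that the heredoc in the script above writes.
# Key names mirror the heredoc; the helper itself is illustrative.
import json


def write_ds_config(path="ds_config.json", global_batch=64, micro_batch=4,
                    world_size=1, zero_stage=2, initial_scale_power=12):
    # DeepSpeed expects train_batch_size to be
    # micro_batch_per_gpu * gradient_accumulation_steps * world_size.
    assert global_batch % (micro_batch * world_size) == 0, \
        "global batch must be divisible by micro_batch * world_size"
    config = {
        "train_batch_size": global_batch,
        "train_micro_batch_size_per_gpu": micro_batch,
        "steps_per_print": 1,
        "zero_optimization": {"stage": zero_stage},
        "fp16": {"enabled": True, "initial_scale_power": initial_scale_power},
        "wall_clock_breakdown": True,
    }
    with open(path, "w") as f:
        json.dump(config, f, indent=2)
    return config


if __name__ == "__main__":
    write_ds_config()  # writes ds_config.json with the script's default values
```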
-------------------------------------------------------------------------------- /src/scripts/sequence_parallel/README.md: --------------------------------------------------------------------------------
1 | # Sequence Parallelism
2 | 
3 | This folder contains examples that demonstrate how to use DeepSpeed's sequence parallelism.
4 | 
5 | ## Setting Up the Environment for FlashAttention
6 | 
7 | DeepSpeed's sequence parallelism can be combined with the following types of attention:
8 | 
9 | - Classic attention
10 | - FlashAttention (enabled by `--use-flash-attn`)
11 | - FlashAttention + Triton (enabled by `--use-flash-attn-triton`)
12 | 
13 | For the best performance, we recommend FlashAttention + Triton. The installation steps and the versions we have tested are listed below. Note that FlashAttention is compatible only with Turing, Ampere, Ada, or Hopper GPUs.
14 | 
15 | ```shell
16 | # install triton
17 | git clone -b legacy-backend https://github.com/openai/triton
18 | cd triton/python/
19 | pip install cmake
20 | pip install .
21 | 
22 | # install flash-attention
23 | cd ${WORK_DIR}
24 | git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention
25 | cd flash-attention
26 | python setup.py install
27 | ```
28 | 
29 | ## Enabling Sequence Parallelism
30 | 
31 | To enable sequence parallelism, set the degree of parallelism with the `--ds-sequence-parallel-size` argument. Make sure that the number of attention heads is divisible by this value.
32 | Also make sure your model configuration complies with FlashAttention's requirements; for instance, the head size should be divisible by 8 for optimal performance. Refer to the [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) documentation for more details. A quick pre-flight check for both constraints is sketched at the end of this README.
33 | 
34 | Some working examples that enable sequence parallelism ([GPT1.3B](ds_pretrain_gpt_1.3B_seq_parallel_32k.sh), [GPT30B](ds_pretrain_gpt_30B_seq_parallel_32k.sh)) are available in this folder.
35 | 
36 | Please note that our sequence parallelism feature is currently incompatible with Megatron-LM's tensor or pipeline parallelism.
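A quick way to validate the two constraints above before launching a job; this helper is illustrative and not part of the repo:

```python
# Illustrative pre-flight check for the constraints described above:
# the sequence-parallel degree must divide the number of attention heads,
# and FlashAttention prefers a head size that is a multiple of 8.
def check_sequence_parallel_config(num_attention_heads: int,
                                   hidden_size: int,
                                   sp_degree: int) -> None:
    assert num_attention_heads % sp_degree == 0, (
        f"--ds-sequence-parallel-size={sp_degree} must divide "
        f"num-attention-heads={num_attention_heads}")
    head_dim = hidden_size // num_attention_heads
    assert head_dim % 8 == 0, (
        f"head size {head_dim} is not a multiple of 8; "
        "FlashAttention performance will suffer")


# Example: a GPT-style config with 32 heads and hidden size 4096
# split across 8 sequence-parallel ranks passes both checks.
check_sequence_parallel_config(num_attention_heads=32,
                               hidden_size=4096,
                               sp_degree=8)
```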
37 | -------------------------------------------------------------------------------- /src/scripts/sequence_parallel/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": GBSIZE, 3 | "train_micro_batch_size_per_gpu": MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": true, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "wall_clock_breakdown" : false 24 | } 25 | -------------------------------------------------------------------------------- /src/tools/convert_checkpoint/deepspeed_to_megatron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import torch 6 | from collections import OrderedDict 7 | from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint 8 | 9 | MODEL_KEY = 'model' 10 | ARGS_KEY = 'args' 11 | LANGUGAGE_MODEL_KEY = 'language_model' 12 | EMBEDDING_KEY = 'embedding' 13 | ENCODER_KEY = 'encoder' 14 | WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head' 15 | WORD_EMBEDDINGS_KEY = 'word_embeddings' 16 | FINAL_LAYER_NORM_KEY ='final_layernorm' 17 | CHECKPOINT_VERSION_KEY = 'checkpoint_version' 18 | CHECKPOINT_VERSION_VALUE = 3.0 19 | ITERATION_KEY = 'iteration' 20 | 21 | def parse_arguments(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder') 24 | parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder') 25 | parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree') 26 | parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree') 27 | parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.') 28 | args = parser.parse_args() 29 | print(f'args = {args}') 30 | return args 31 | 32 | 33 | def _convert_ds_transformer_state(sd_list): 34 | new_sd = OrderedDict() 35 | for i, sd in enumerate(sd_list): 36 | for key, value in sd.items(): 37 | new_key = f'layers.{i}.{key}' 38 | new_sd[new_key] = value 39 | 40 | return new_sd 41 | 42 | def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): 43 | path_list = [] 44 | iter_folder = f'iter_{iteration:07d}' 45 | for i in range(0, tp_degree): 46 | path_list.append([]) 47 | for j in range(0, pp_degree): 48 | rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}' 49 | ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt') 50 | path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path)) 51 | 52 | return path_list 53 | 54 | 55 | def _create_megatron_dict(): 56 | language_model_dict = { 57 | EMBEDDING_KEY: {}, 58 | ENCODER_KEY: {} 59 | } 60 | megatron_dict = { 61 | MODEL_KEY: {LANGUGAGE_MODEL_KEY: language_model_dict}, 62 | CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE 63 | } 64 | return megatron_dict 65 | 66 | 67 | def _save_checkpoint(file_path, chkpt_sd): 68 | dir, _ = os.path.split(file_path) 69 | os.makedirs(dir, exist_ok=True) 70 | torch.save(chkpt_sd, file_path) 71 | 72 | 73 | def _renest_sd(sd): 74 | new_sd = OrderedDict() 75 | for 
key, value in sd.items(): 76 | a, b = key.split('.') 77 | new_sd[a] = {b: value} 78 | return new_sd 79 | 80 | 81 | def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False): 82 | meg_encoder_sd = OrderedDict() 83 | meg_embedding_sd = OrderedDict() 84 | meg_embedding_for_head_sd = OrderedDict() 85 | 86 | transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index) 87 | meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd)) 88 | 89 | if pp_index in [0, ds_checkpoint.pp_degree - 1]: 90 | embedding_sd = ds_checkpoint.get_embedding_state(tp_index) 91 | nested_embedding_sd = _renest_sd(embedding_sd) 92 | if pp_index == 0: 93 | meg_embedding_sd.update(nested_embedding_sd) 94 | 95 | if pp_index == ds_checkpoint.pp_degree -1: 96 | for key, value in embedding_sd.items(): 97 | if key.startswith(WORD_EMBEDDINGS_KEY): 98 | fields = key.split('.') 99 | new_fields = fields[1:] 100 | new_key = '.'.join(new_fields) 101 | meg_embedding_for_head_sd[new_key] = value 102 | 103 | final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index) 104 | new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()} 105 | meg_encoder_sd.update(new_final_norm_sd) 106 | 107 | checkpoint_sd = _create_megatron_dict() 108 | 109 | iteration = ds_checkpoint.get_iteration() 110 | checkpoint_sd[ITERATION_KEY] = iteration 111 | if pp_index == 0: 112 | checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd 113 | checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd 114 | if pp_index == ds_checkpoint.pp_degree -1: 115 | checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd 116 | 117 | checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args() 118 | # Adjust specific fields 119 | checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree 120 | checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree 121 | if for_release: 122 | checkpoint_sd[ARGS_KEY].consumed_train_samples = 0 123 | checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0 124 | 125 | return checkpoint_sd 126 | 127 | 128 | def _create_latest_file(base_folder, iteration): 129 | file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt') 130 | os.makedirs(base_folder, exist_ok=True) 131 | with open(file_path, 'w') as f: 132 | f.write(str(iteration)) 133 | 134 | def main(): 135 | print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint') 136 | 137 | args = parse_arguments() 138 | print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}') 139 | 140 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) 141 | iteration = ds_checkpoint.get_iteration() 142 | _create_latest_file(args.output_folder, iteration) 143 | checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) 144 | for i in range(0, ds_checkpoint.tp_degree): 145 | for j in range(0, ds_checkpoint.pp_degree): 146 | sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release) 147 | _save_checkpoint(checkpoint_paths[i][j], sd) 148 | 149 | if __name__ == "__main__": 150 | main() -------------------------------------------------------------------------------- /src/tools/convert_checkpoint/deepspeed_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import 
torch 5 | import json 6 | 7 | from deepspeed_checkpoint import DeepSpeedCheckpoint 8 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments 9 | 10 | # the import was tested to work with this version 11 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider 12 | # copying that version here instead 13 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint 14 | from transformers import GPT2Config 15 | 16 | def main(): 17 | 18 | # this first part comes mainly from deepspeed_to_megatron.main 19 | args = parse_arguments() 20 | print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}') 21 | 22 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) 23 | iteration = ds_checkpoint.get_iteration() 24 | input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release) 25 | 26 | # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main 27 | # Spell out all parameters in case the defaults change. 28 | config = GPT2Config( 29 | vocab_size=50257, 30 | n_positions=1024, 31 | n_ctx=1024, 32 | n_embd=1024, 33 | n_layer=24, 34 | n_head=16, 35 | n_inner=4096, 36 | activation_function="gelu", # used to be "gelu_new" in earlier versions 37 | resid_pdrop=0.1, 38 | embd_pdrop=0.1, 39 | attn_pdrop=0.1, 40 | layer_norm_epsilon=1e-5, 41 | initializer_range=0.02, 42 | summary_type="cls_index", 43 | summary_use_proj=True, 44 | summary_activation=None, 45 | summary_proj_to_labels=True, 46 | summary_first_dropout=0.1, 47 | scale_attn_weights=True, 48 | gradient_checkpointing=False, 49 | use_cache=True, 50 | bos_token_id=50256, 51 | eos_token_id=50256, 52 | ) 53 | 54 | # Convert. 55 | print("Converting to HF Checkpoint") 56 | output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) 57 | 58 | basename = args.output_folder 59 | os.makedirs(basename, exist_ok=True) 60 | 61 | # Print the structure of converted state dict. 62 | #if args.print_checkpoint_structure: 63 | # recursive_print(None, output_state_dict) 64 | 65 | # Store the config to file. 66 | output_config_file = os.path.join(basename, "config.json") 67 | output_config = config.to_dict() 68 | output_config["architectures"] = ["GPT2LMHeadModel"] 69 | output_config["model_type"] = "gpt2" 70 | print(f'Saving config to "{output_config_file}"') 71 | with open(output_config_file, "w") as f: 72 | json.dump(output_config, f) 73 | 74 | # Store the state_dict to file. 75 | output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") 76 | print(f'Saving checkpoint to "{output_checkpoint_file}"') 77 | torch.save(output_state_dict, output_checkpoint_file) 78 | 79 | print("Now add tokenizer files and upload to the hub") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /src/tools/convert_checkpoint/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import argparse 4 | import importlib 5 | import torch.multiprocessing as mp 6 | import os 7 | import sys 8 | 9 | # A loader is a python file with at least two functions 10 | # - add_arguments - takes in a parser and adds any arguments needed 11 | # - load_checkpoint - takes in the queue and parsed arguments 12 | 13 | # A saver is similar but has save_checkpoint instead of 14 | # load_checkpoint 15 | 16 | # The loader and saver process are each given a queue, the loader 17 | # should load the checkpoint and send the weights in messages in the 18 | # following order, the saver should receive them in this order and 19 | # save the checkpoints. A message consists of a python dictionary with 20 | # a "name" for error checking and an entry for each tensor as 21 | # indicated below. Note that the weight sent over the queue are the 22 | # full model weights, nothing split. 23 | 24 | # If the loader ever sends "exit" to the queue, that means something 25 | # went wrong and it is exiting. 26 | 27 | # - Metadata Namespace with the following attributes: 28 | # model_type - GPT, BERT, T5, etc. (Part of protocol to allow this to be deduced later instead of given on command line) 29 | # num_layers - Number of transformer layers 30 | # hidden_size 31 | # seq_length 32 | # num_attention_heads 33 | # max_position_embeddings 34 | # tokenizer_type 35 | # iteration 36 | # params_dtype 37 | # bert_binary_head - Used only if model_type is BERT 38 | # previous_tensor_parallel_size - Optional 39 | # previous_pipeline_parallel_size - Optional 40 | # true_vocab_size 41 | # make_vocab_size_divisble_by 42 | # consumed_train_samples 43 | # consumed_valid_samples 44 | # messages 45 | # { 46 | # "name": "embeddings" 47 | # "position embeddings" 48 | # "word embeddings" 49 | # } 50 | # (for each transformer layer): 51 | # { 52 | # "name": "transformer layer N" 53 | # "input layernorm weight" 54 | # "input layernorm bias" 55 | # "qkv weight" 56 | # "qkv bias" 57 | # "dense weight" 58 | # "dense bias" 59 | # "post layernorm weight" 60 | # "post layernorm bias" 61 | # "mlp l0 weight" 62 | # "mlp l0 bias" 63 | # "mlp l1 weight" 64 | # "mlp l1 bias" 65 | # } 66 | # { 67 | # "name": "final layer norm" 68 | # "weight" 69 | # "bias" 70 | # } 71 | # if present (i.e. for BERT): 72 | # { 73 | # "name": "pooler" 74 | # "weight" 75 | # "bias" 76 | # } 77 | # { 78 | # "name": "lm head" 79 | # "dense weight" 80 | # "dense bias" 81 | # "layernorm weight" 82 | # "layernorm bias" 83 | # } 84 | # { 85 | # "name": "binary head" 86 | # "weight" 87 | # "bias" 88 | # } 89 | # - "done" 90 | 91 | def load_plugin(plugin_type, name): 92 | module_name = f"{plugin_type}_{name}" 93 | try: 94 | plugin = importlib.import_module(module_name) 95 | except ModuleNotFoundError: 96 | module_name = name 97 | try: 98 | plugin = importlib.import_module(module_name) 99 | except ModuleNotFoundError: 100 | sys.exit(f"Unable to load {plugin_type} plugin {name}. Exiting.") 101 | 102 | if not hasattr(plugin, 'add_arguments'): 103 | sys.exit(f"{module_name} module is not a plugin. 
Exiting.") 104 | 105 | print(f"Loaded {module_name} as the {plugin_type}.") 106 | return plugin 107 | 108 | def main(): 109 | import argparse 110 | parser = argparse.ArgumentParser(description="Megatron Checkpoint Utility Arguments", 111 | allow_abbrev=False, conflict_handler='resolve') 112 | 113 | parser.add_argument('--model-type', type=str, required=True, 114 | choices=['GPT', 'BERT'], 115 | help='Type of the model') 116 | parser.add_argument('--loader', type=str, default='megatron', 117 | help='Module name to load checkpoint, should be on python path') 118 | parser.add_argument('--saver', type=str, default='megatron', 119 | help='Module name to save checkpoint, shdoul be on python path') 120 | parser.add_argument('--load-dir', type=str, required=True, 121 | help='Directory to load model checkpoint from') 122 | parser.add_argument('--save-dir', type=str, required=True, 123 | help='Directory to save model checkpoint to') 124 | parser.add_argument('--max-queue-size', type=int, default=50, 125 | help='Maximum number of tensors in the queue') 126 | parser.add_argument('--no-checking', action='store_false', 127 | help='Do not perform checking on the name and ordering of weights', 128 | dest='checking') 129 | 130 | known_args, _ = parser.parse_known_args() 131 | loader = load_plugin('loader', known_args.loader) 132 | saver = load_plugin('saver', known_args.saver) 133 | 134 | loader.add_arguments(parser) 135 | saver.add_arguments(parser) 136 | 137 | args = parser.parse_args() 138 | 139 | queue = mp.Queue(maxsize=args.max_queue_size) 140 | 141 | print("Starting saver...") 142 | saver_proc = mp.Process(target=saver.save_checkpoint, args=(queue, args)) 143 | saver_proc.start() 144 | 145 | print("Starting loader...") 146 | loader.load_checkpoint(queue, args) 147 | 148 | print("Waiting for saver to complete...") 149 | saver_proc.join() 150 | 151 | 152 | if __name__ == '__main__': 153 | main() --------------------------------------------------------------------------------