├── .gitignore
├── 1.pre_train_math.py
├── 1.pre_train_math_moe.py
├── 3.pretrain_gpt125M.sh
├── 3.pretrain_llama2.sh
├── 4.aft_train_math.py
├── README.md
├── assets
    ├── pre_math.png
    └── title.png
├── scripts
    ├── kill_process.sh
    ├── sbatch.sh
    └── srun.sh
└── src
    ├── megatron
        ├── __init__.py
        ├── arguments.py
        ├── checkpointing.py
        ├── core
        │   ├── README.md
        │   ├── __init__.py
        │   ├── enums.py
        │   ├── fusions
        │   │   ├── __init__.py
        │   │   ├── fused_bias_dropout.py
        │   │   ├── fused_bias_gelu.py
        │   │   ├── fused_layer_norm.py
        │   │   └── fused_softmax.py
        │   ├── model_parallel_config.py
        │   ├── models
        │   │   ├── __init__.py
        │   │   └── gpt
        │   │   │   ├── __init__.py
        │   │   │   ├── gpt_embedding.py
        │   │   │   └── gpt_model.py
        │   ├── package_info.py
        │   ├── parallel_state.py
        │   ├── pipeline_parallel
        │   │   ├── __init__.py
        │   │   ├── p2p_communication.py
        │   │   └── schedules.py
        │   ├── requirements.txt
        │   ├── sequence_parallel
        │   │   ├── __init__.py
        │   │   └── cross_entropy.py
        │   ├── tensor_parallel
        │   │   ├── __init__.py
        │   │   ├── cross_entropy.py
        │   │   ├── data.py
        │   │   ├── layers.py
        │   │   ├── mappings.py
        │   │   ├── random.py
        │   │   └── utils.py
        │   ├── transformer
        │   │   ├── __init__.py
        │   │   ├── attention.py
        │   │   ├── core_attention.py
        │   │   ├── custom_layers
        │   │   │   └── transformer_engine.py
        │   │   ├── enums.py
        │   │   ├── mlp.py
        │   │   ├── module.py
        │   │   ├── transformer_block.py
        │   │   ├── transformer_config.py
        │   │   ├── transformer_layer.py
        │   │   └── utils.py
        │   └── utils.py
        ├── data
        │   ├── Makefile
        │   ├── __init__.py
        │   ├── autoaugment.py
        │   ├── bert_dataset.py
        │   ├── biencoder_dataset_utils.py
        │   ├── blendable_dataset.py
        │   ├── data_samplers.py
        │   ├── dataset_utils.py
        │   ├── gpt_dataset.py
        │   ├── helpers.cpp
        │   ├── ict_dataset.py
        │   ├── image_folder.py
        │   ├── indexed_dataset.py
        │   ├── orqa_wiki_dataset.py
        │   ├── realm_dataset_utils.py
        │   ├── realm_index.py
        │   ├── t5_dataset.py
        │   ├── test
        │   │   ├── test_indexed_dataset.py
        │   │   └── test_preprocess_data.sh
        │   └── vit_dataset.py
        ├── dist_signal_handler.py
        ├── enums.py
        ├── fp16_deprecated
        │   └── loss_scaler.py
        ├── fused_kernels
        │   ├── __init__.py
        │   ├── compat.h
        │   ├── scaled_masked_softmax.cpp
        │   ├── scaled_masked_softmax.h
        │   ├── scaled_masked_softmax_cuda.cu
        │   ├── scaled_softmax.cpp
        │   ├── scaled_softmax_cuda.cu
        │   ├── scaled_upper_triang_masked_softmax.cpp
        │   ├── scaled_upper_triang_masked_softmax.h
        │   ├── scaled_upper_triang_masked_softmax_cuda.cu
        │   ├── tests
        │   │   ├── __init__.py
        │   │   └── test_fused_kernels.py
        │   └── type_shim.h
        ├── global_vars.py
        ├── indexer.py
        ├── initialize.py
        ├── memory.py
        ├── microbatches.py
        ├── model
        │   ├── __init__.py
        │   ├── bert_model.py
        │   ├── biencoder_model.py
        │   ├── classification.py
        │   ├── distributed.py
        │   ├── enums.py
        │   ├── fused_bias_gelu.py
        │   ├── fused_layer_norm.py
        │   ├── fused_softmax.py
        │   ├── gpt_model.py
        │   ├── language_model.py
        │   ├── module.py
        │   ├── multiple_choice.py
        │   ├── realm_model.py
        │   ├── rotary_pos_embedding.py
        │   ├── t5_model.py
        │   ├── transformer.py
        │   ├── utils.py
        │   └── vision
        │   │   ├── classification.py
        │   │   ├── dino.py
        │   │   ├── esvit_swin_backbone.py
        │   │   ├── inpainting.py
        │   │   ├── knn_monitor.py
        │   │   ├── mit_backbone.py
        │   │   ├── swin_backbone.py
        │   │   ├── utils.py
        │   │   └── vit_backbone.py
        ├── mpu
        │   └── tests
        │   │   ├── __init__.py
        │   │   ├── commons.py
        │   │   ├── test_cross_entropy.py
        │   │   ├── test_data.py
        │   │   ├── test_initialize.py
        │   │   ├── test_layers.py
        │   │   └── test_random.py
        ├── optimizer
        │   ├── __init__.py
        │   ├── clip_grads.py
        │   ├── distrib_optimizer.py
        │   ├── grad_scaler.py
        │   └── optimizer.py
        ├── optimizer_param_scheduler.py
        ├── p2p_communication.py
        ├── static
        │   └── index.html
        ├── text_generation
        │   ├── __init__.py
        │   ├── api.py
        │   ├── beam_utils.py
        │   ├── communication.py
        │   ├── forward_step.py
        │   ├── generation.py
        │   ├── sampling.py
        │   └── tokenization.py
        ├── text_generation_server.py
        ├── text_generation_utils.py
        ├── timers.py
        ├── tokenizer
        │   ├── __init__.py
        │   ├── bert_tokenization.py
        │   ├── gpt2_tokenization.py
        │   └── tokenizer.py
        ├── training.py
        └── utils.py
    ├── pretrain.py
    ├── scripts
        ├── generate_text.sh
        ├── gpt
        │   └── ds_config_gpt_TEMPLATE.json
        ├── pretrain_llama_distributed.sh
        ├── run_deepspeed_example.sh
        └── sequence_parallel
        │   ├── README.md
        │   ├── ds_config_gpt_TEMPLATE.json
        │   ├── ds_pretrain_gpt_1.3B_seq_parallel_32k.sh
        │   └── ds_pretrain_gpt_30B_seq_parallel_32k.sh
    └── tools
        ├── convert_checkpoint
            ├── convert_llama_weights_to_hf.py
            ├── deepspeed_to_megatron.py
            ├── deepspeed_to_transformers.py
            ├── loader_llama2_hf.py
            ├── saver_megatron.py
            └── utils.py
        └── preprocess_data.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # *.txt  ，*.xls  表示过滤某种类型的文件
 2 | # target/ ：表示过滤这个文件夹下的所有文件
 3 | # /test/a.txt ，/test/b.xls  表示指定过滤某个文件下具体文件
 4 | # !*.java , !/dir/test/     !开头表示不过滤
 5 | # *.[ab]    支持通配符：过滤所有以.a或者.b为扩展名的文件
 6 | # /test  仅仅忽略项目根目录下的 test 文件，不包括 child/test等非根目录的test目录
 7 | 
 8 | 
 9 | /ckpts/*
10 | 


--------------------------------------------------------------------------------
/3.pretrain_llama2.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # This example script is contributed by external user https://github.com/nrailgun
  3 | set -ex
  4 | 
  5 | ######################################
  6 | # Change the below configurations here
  7 | BASE_PATH=./tmp
  8 | DS_CONFIG=${BASE_PATH}/deepspeed.json
  9 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence"
 10 | DATASET="1 ${DATASET_1}"
 11 | CHECKPOINT_PATH=./tmp
 12 | TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model
 13 | 
 14 | TP=2
 15 | PP=2
 16 | ZERO_STAGE=0
 17 | 
 18 | GPUS_PER_NODE=8
 19 | MASTER_ADDR=localhost
 20 | MASTER_PORT=6000
 21 | NNODES=1
 22 | NODE_RANK=0
 23 | 
 24 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
 25 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
 26 | NUM_LAYERS=24 # e.g. llama-13b: 40
 27 | NUM_HEADS=16 # e.g. llama-13b: 40
 28 | SEQ_LENGTH=2048
 29 | NUM_KV_HEADS=4 # llama2 70B uses GQA
 30 | 
 31 | MICRO_BATCH_SIZE=4
 32 | GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens
 33 | TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
 34 | LR=3e-4
 35 | MIN_LR=3e-5
 36 | LR_WARMUP_STEPS=2000
 37 | WEIGHT_DECAY=0.1
 38 | GRAD_CLIP=1
 39 | 
 40 | ## Activation checkpointing saves GPU memory, but reduces training speed
 41 | activation_checkpoint="true"
 42 | # activation_checkpoint="false"
 43 | 
 44 | # Below configuration required for llama model as per llama paper
 45 | # --no-query-key-layer-scaling \
 46 | # --attention-dropout 0 \
 47 | # --hidden-dropout 0 \
 48 | # --use-rotary-position-embeddings \
 49 | # --untie-embeddings-and-output-weights \
 50 | # --swiglu \
 51 | # --normalization rmsnorm \
 52 | # --disable-bias-linear \
 53 | ######################################
 54 | 
 55 | 
 56 | 
 57 | cat <<EOT > $DS_CONFIG
 58 | {
 59 |   "train_batch_size" : $GLOBAL_BATCH_SIZE,
 60 |   "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
 61 |   "steps_per_print": 1,
 62 |   "zero_optimization": {
 63 |     "stage": $ZERO_STAGE
 64 |   },
 65 |   "bf16": {
 66 |     "enabled": true
 67 |   }
 68 | }
 69 | EOT
 70 | 
 71 | ds_args=""
 72 | ds_args=" --deepspeed ${ds_args}"
 73 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
 74 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
 75 | 
 76 | if [ "${activation_checkpoint}" = "true" ]; then
 77 |   ds_args="--deepspeed-activation-checkpointing ${ds_args}"
 78 | 
 79 |   ## old argument for recomputing the transformer layer
 80 |   # ds_args="--checkpoint-activations ${ds_args}"
 81 | 
 82 |   ## new argument for recomputing the transformer layer
 83 |   ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}"
 84 |   ## new argument for recomputing only the attention layer
 85 |   # ds_args="--recompute-granularity selective ${ds_args}"
 86 | fi
 87 | 
 88 | 
 89 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 90 | 
 91 | torchrun $DISTRIBUTED_ARGS \
 92 |        ./src/pretrain.py \
 93 |        --tensor-model-parallel-size $TP \
 94 |        --pipeline-model-parallel-size $PP \
 95 |        --num-layers $NUM_LAYERS \
 96 |        --hidden-size $HIDDEN_SIZE \
 97 |        --ffn-hidden-size $FFN_HIDDEN_SIZE \
 98 |        --num-attention-heads $NUM_HEADS \
 99 |        --micro-batch-size $MICRO_BATCH_SIZE \
100 |        --global-batch-size $GLOBAL_BATCH_SIZE \
101 |        --seq-length $SEQ_LENGTH \
102 |        --max-position-embeddings $SEQ_LENGTH \
103 |        --train-iters $TRAIN_STEPS \
104 |        --save $CHECKPOINT_PATH \
105 |        --load $CHECKPOINT_PATH \
106 |        --data-path $DATASET \
107 |        --data-impl mmap \
108 |        --tokenizer-type GPTSentencePieceTokenizer \
109 |        --tokenizer-model $TOKENIZER_PATH \
110 |        --split 949,50,1 \
111 |        --distributed-backend nccl \
112 |        --lr $LR \
113 |        --lr-decay-style cosine \
114 |        --min-lr $MIN_LR \
115 |        --weight-decay $WEIGHT_DECAY \
116 |        --clip-grad $GRAD_CLIP \
117 |        --lr-warmup-iters $LR_WARMUP_STEPS \
118 |        --optimizer adam \
119 |        --adam-beta1 0.9 \
120 |        --adam-beta2 0.95 \
121 |        --log-interval 1 \
122 |        --save-interval 10000 \
123 |        --eval-interval 1000 \
124 |        --eval-iters 10 \
125 |        --bf16 \
126 |        --no-query-key-layer-scaling \
127 |        --attention-dropout 0 \
128 |        --hidden-dropout 0 \
129 |        --use-rotary-position-embeddings \
130 |        --untie-embeddings-and-output-weights \
131 |        --swiglu \
132 |        --normalization rmsnorm \
133 |        --disable-bias-linear \
134 |        --num-key-value-heads $NUM_KV_HEADS \
135 |        $ds_args
136 | 


--------------------------------------------------------------------------------
/4.aft_train_math.py:
--------------------------------------------------------------------------------
1 | model_size_in_B=7
2 | seqlen=4096
3 | global_batch_size=128
4 | time_in_sec_per_interation=6.8
5 | total_gpus=32
6 | 
7 | TFLOPS=model_size_in_B * 4 * 2 * seqlen * global_batch_size / (time_in_sec_per_interation * total_gpus * 1e3) # https://arxiv.org/pdf/2104.04473.pdf 
8 | print(f'TFLOPS:{TFLOPS:.2f}')


--------------------------------------------------------------------------------
/assets/pre_math.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/assets/pre_math.png


--------------------------------------------------------------------------------
/assets/title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/assets/title.png


--------------------------------------------------------------------------------
/scripts/kill_process.sh:
--------------------------------------------------------------------------------
1 | # 终止进程
2 | # pkill -f "python -u src/gpt4_eval.py"
3 | pkill -f "python"
4 | 
5 | if [ $? -eq 0 ]; then
6 |   echo "Process terminated."
7 | else
8 |   echo "No process found."
9 | fi


--------------------------------------------------------------------------------
/scripts/sbatch.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -J test
 3 | #SBATCH -p p-A100
 4 | #SBATCH -N 1
 5 | #SBATCH --cpus-per-task=96
 6 | #SBATCH --reservation=root_114  # 仅限于wangbeny用户 其余用户不用这一行
 7 | #SBATCH -w pgpu17
 8 | #SBATCH --gres=gpu:8
 9 | 
10 | bash /mntcephfs/data/med/xidong/yaojishi/gen_ans.sh   # 要运行的命令行指令


--------------------------------------------------------------------------------
/scripts/srun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | srun --job-name=test  --gres=gpu:2   -w pgpu20 -p p-A100 -c 24  --reservation=root_114 --pty bash 


--------------------------------------------------------------------------------
/src/megatron/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | from .global_vars import get_args, get_retro_args
 6 | from .global_vars import get_current_global_batch_size
 7 | from .global_vars import get_num_microbatches
 8 | from .global_vars import get_signal_handler
 9 | from .global_vars import update_num_microbatches
10 | from .global_vars import get_tokenizer
11 | from .global_vars import get_tensorboard_writer
12 | from .global_vars import get_adlr_autoresume
13 | from .global_vars import get_timers
14 | from .initialize  import initialize_megatron
15 | 
16 | from .utils import (print_rank_0,
17 |                     is_last_rank,
18 |                     print_rank_last,
19 |                     is_rank_0,
20 |                     is_aml)
21 | 


--------------------------------------------------------------------------------
/src/megatron/core/README.md:
--------------------------------------------------------------------------------
1 | Megatron Core is a library for efficient and scalable training of transformer based models.
2 | 


--------------------------------------------------------------------------------
/src/megatron/core/__init__.py:
--------------------------------------------------------------------------------
 1 | import megatron.core.parallel_state
 2 | import megatron.core.tensor_parallel
 3 | import megatron.core.utils
 4 | 
 5 | from .model_parallel_config import ModelParallelConfig
 6 | 
 7 | # Alias parallel_state as mpu, its legacy name
 8 | mpu = parallel_state
 9 | 
10 | __all__ = [
11 |     "parallel_state",
12 |     "tensor_parallel",
13 |     "utils",
14 |     "ModelParallelConfig"
15 | ]
16 | 


--------------------------------------------------------------------------------
/src/megatron/core/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | class ModelType(enum.Enum):
 6 |     encoder_or_decoder = 1
 7 |     encoder_and_decoder = 2
 8 |     retro_encoder = 3
 9 |     retro_decoder = 4
10 | 


--------------------------------------------------------------------------------
/src/megatron/core/fusions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/core/fusions/__init__.py


--------------------------------------------------------------------------------
/src/megatron/core/fusions/fused_bias_dropout.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | from typing import Tuple, Optional
 5 | 
 6 | def _bias_dropout_add_func(x, bias, residual, prob, training):
 7 |     # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor
 8 |     # NOTE: Previously, the argument `bias` used to be passed as
 9 |     # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the
10 |     # transformer layer but broadcasting should automatically take care of that.
11 |     # Also, looking at broadcasting semantics, `expand_as` and broadcasting
12 |     # seem to be identical performance-wise (both just change the view).
13 |     if bias is not None:
14 |         x = x + bias
15 |     out = torch.nn.functional.dropout(x, p=prob, training=training)
16 |     out = residual + out
17 |     return out
18 | 
19 | def get_bias_dropout_add(training, fused):
20 | 
21 |     def unfused_bias_dropout_add(x_with_bias, residual, prob):
22 |         x, bias = x_with_bias # unpack
23 |         return _bias_dropout_add_func(x, bias, residual, prob, training)
24 | 
25 |     @torch.jit.script
26 |     def bias_dropout_add_fused_train(
27 |         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
28 |         residual: torch.Tensor,
29 |         prob: float
30 |     ) -> torch.Tensor:
31 |         x, bias = x_with_bias # unpack
32 |         return _bias_dropout_add_func(x, bias, residual, prob, True)
33 | 
34 |     @torch.jit.script
35 |     def bias_dropout_add_fused_inference(
36 |         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
37 |         residual: torch.Tensor,
38 |         prob: float
39 |     ) -> torch.Tensor:
40 |         x, bias = x_with_bias # unpack
41 |         return _bias_dropout_add_func(x, bias, residual, prob, False)
42 | 
43 |     if fused:
44 |         # jit scripting for a nn.module (with dropout) is not
45 |         # triggering the fusion kernel. For now, we use two
46 |         # different nn.functional routines to account for varying
47 |         # dropout semantics during training and inference phases.
48 |         if training:
49 |             return bias_dropout_add_fused_train
50 |         else:
51 |             return bias_dropout_add_fused_inference
52 |     else:
53 |         return unfused_bias_dropout_add
54 | 


--------------------------------------------------------------------------------
/src/megatron/core/fusions/fused_bias_gelu.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 7 | # 1/sqrt(2*pi)-> 0.3989423
 8 | # 1/sqrt(2)   -> 0.70710678
 9 | # sqrt(2/pi)  -> 0.79788456
10 | # this function is tanh approximation of gelu
11 | # actual gelu is:
12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
13 | 
14 | @torch.jit.script
15 | def bias_gelu(bias, y):
16 |     x = bias + y
17 |     return  x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
18 | 
19 | # gradient of tanh approximation of gelu
20 | # gradient of actual gelu is:
21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
22 | @torch.jit.script
23 | def bias_gelu_back(g, bias, y):
24 |     x = bias + y
25 |     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
26 |     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
27 |     ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
28 |     return ff*g
29 | 
30 | class GeLUFunction(torch.autograd.Function):
31 |     @staticmethod
32 |     # bias is an optional argument
33 |     def forward(ctx, input, bias):
34 |         ctx.save_for_backward(input, bias)
35 |         return bias_gelu(bias, input)
36 | 
37 |     @staticmethod
38 |     def backward(ctx, grad_output):
39 |         input, bias = ctx.saved_tensors
40 |         tmp = bias_gelu_back(grad_output, bias, input)
41 |         return tmp, tmp
42 | 
43 | bias_gelu_impl = GeLUFunction.apply
44 | 


--------------------------------------------------------------------------------
/src/megatron/core/fusions/fused_layer_norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import numbers
 4 | import torch
 5 | from torch.nn.parameter import Parameter
 6 | from torch.nn import init
 7 | import importlib
 8 | 
 9 | from megatron.core.utils import make_viewless_tensor
10 | 
11 | try:
12 |     from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
13 |     HAVE_PERSIST_LAYER_NORM = True
14 | except:
15 |     HAVE_PERSIST_LAYER_NORM = False
16 | 
17 | try:
18 |     from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
19 |     HAVE_FUSED_LAYER_NORM = True
20 | except:
21 |     HAVE_FUSED_LAYER_NORM = False
22 | 
23 | 
24 | class FusedLayerNorm(torch.nn.Module):
25 | 
26 |   def __init__(self, hidden_size, eps=1e-5,
27 |                persist_layer_norm=True,
28 |                sequence_parallel=False,
29 |                zero_centered_gamma=False):
30 |         super().__init__()
31 | 
32 |         self.zero_centered_gamma = zero_centered_gamma
33 | 
34 |         # List of hiddens sizes supported in the persistent layer norm kernel
35 |         # If the hidden size is not supported, fall back to the non-persistent
36 |         # kernel.
37 |         persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
38 |             5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
39 |             24576, 25600, 30720, 32768, 40960, 49152, 65536]
40 |         if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
41 |             persist_layer_norm = False
42 | 
43 |         if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM:
44 |             # TODO: Add pytorch only layer norm
45 |             raise ValueError(f'Apex must currently be installed to use megatron core.')
46 | 
47 |         if isinstance(hidden_size, numbers.Integral):
48 |             hidden_size = (hidden_size,)
49 |         self.hidden_size = torch.Size(hidden_size)
50 |         self.eps = eps
51 |         self.weight = Parameter(torch.Tensor(*hidden_size))
52 |         self.bias = Parameter(torch.Tensor(*hidden_size))
53 |         self.reset_parameters()
54 |         self.persist_layer_norm = persist_layer_norm
55 |         self.sequence_parallel = sequence_parallel
56 | 
57 |         # set sequence parallelism flag on weight and bias parameters
58 |         setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
59 |         setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
60 | 
61 | 
62 |   def reset_parameters(self):
63 | 
64 |     if self.zero_centered_gamma:
65 |         init.zeros_(self.weight)
66 |         init.zeros_(self.bias)
67 |     else:
68 |         init.ones_(self.weight)
69 |         init.zeros_(self.bias)
70 | 
71 |   def forward(self, input):
72 | 
73 |     weight = self.weight + 1 if self.zero_centered_gamma else self.weight
74 | 
75 |     if self.persist_layer_norm:
76 |         output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
77 | 
78 |         # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
79 |         # a populated '_base' field). This will result in schedule.py's
80 |         # deallocate_output_tensor() throwing an error, so a viewless tensor is
81 |         # created to prevent this.
82 |         output = make_viewless_tensor(inp = output,
83 |                                       requires_grad = input.requires_grad,
84 |                                       keep_graph = True)
85 | 
86 |     else:
87 |         output = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.hidden_size, self.eps)
88 | 
89 |     return output
90 | 


--------------------------------------------------------------------------------
/src/megatron/core/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/core/models/__init__.py


--------------------------------------------------------------------------------
/src/megatron/core/models/gpt/__init__.py:
--------------------------------------------------------------------------------
1 | from .gpt_model import GPTModel
2 | 


--------------------------------------------------------------------------------
/src/megatron/core/models/gpt/gpt_embedding.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import torch
  4 | 
  5 | from megatron.core import tensor_parallel
  6 | 
  7 | from megatron.core.transformer.module import MegatronModule
  8 | from megatron.core.transformer.transformer_config import TransformerConfig
  9 | 
 10 | 
 11 | class GPTEmbedding(MegatronModule):
 12 |     """Language model embeddings.
 13 | 
 14 |     Arguments:
 15 |         config (TransformerConfig): config object with all necessary configs for TransformerBlock
 16 |         vocab_size (int): vocabulary size
 17 |         max_sequence_length (int): maximum size of sequence. This
 18 |                              is used for positional embedding
 19 |         embedding_dropout_prob float): dropout probability for embeddings
 20 |     """
 21 | 
 22 |     def __init__(self, config: TransformerConfig, vocab_size: int, max_sequence_length: int):
 23 |         super().__init__(config=config)
 24 | 
 25 |         self.config: TransformerConfig = config
 26 |         self.vocab_size: int = vocab_size
 27 |         self.max_sequence_length: int = max_sequence_length
 28 | 
 29 |         # Word embeddings (parallel).
 30 |         self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
 31 |             num_embeddings=self.vocab_size,
 32 |             embedding_dim=self.config.hidden_size,
 33 |             init_method=self.config.init_method,
 34 |             config=self.config
 35 |         )
 36 |         # @jcasper are these keys needed?
 37 |         self._word_embeddings_key = 'word_embeddings'
 38 | 
 39 |         # Position embedding (serial).
 40 |         self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size)
 41 |         self._position_embeddings_key = 'position_embeddings'
 42 | 
 43 |         # Initialize the position embeddings.
 44 |         if self.config.perform_initialization:
 45 |             self.config.init_method(self.position_embeddings.weight)
 46 | 
 47 |         # Embeddings dropout
 48 |         self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)
 49 | 
 50 |     def zero_parameters(self):
 51 |         """Zero out all parameters in embedding."""
 52 |         self.word_embeddings.weight.data.fill_(0)
 53 |         self.word_embeddings.weight.shared = True
 54 |         self.position_embeddings.weight.data.fill_(0)
 55 |         self.position_embeddings.weight.shared = True
 56 | 
 57 |     def forward(self, input_ids, position_ids):
 58 |         # Embeddings.
 59 |         words_embeddings = self.word_embeddings(input_ids)
 60 |         position_embeddings = self.position_embeddings(position_ids)
 61 |         embeddings = words_embeddings + position_embeddings
 62 | 
 63 |         # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
 64 |         embeddings = embeddings.transpose(0, 1).contiguous()
 65 | 
 66 |         # If the input flag for fp32 residual connection is set, convert for float.
 67 |         if self.config.fp32_residual_connection:
 68 |             embeddings = embeddings.float()
 69 | 
 70 |         # Dropout.
 71 |         if self.config.sequence_parallel:
 72 |             embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
 73 |             with tensor_parallel.get_cuda_rng_tracker().fork():
 74 |                 embeddings = self.embedding_dropout(embeddings)
 75 |         else:
 76 |             embeddings = self.embedding_dropout(embeddings)
 77 | 
 78 |         return embeddings
 79 | 
 80 |     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
 81 |         """For easy load."""
 82 | 
 83 |         state_dict_ = {}
 84 |         state_dict_[self._word_embeddings_key] = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars)
 85 |         state_dict_[self._position_embeddings_key] = self.position_embeddings.state_dict(
 86 |             prefix=prefix, keep_vars=keep_vars
 87 |         )
 88 | 
 89 |         return state_dict_
 90 | 
 91 |     def load_state_dict(self, state_dict, strict=True):
 92 |         """Customized load."""
 93 | 
 94 |         # Word embedding.
 95 |         if self._word_embeddings_key in state_dict:
 96 |             state_dict_ = state_dict[self._word_embeddings_key]
 97 |         else:
 98 |             # for backward compatibility.
 99 |             state_dict_ = {}
100 |             for key in state_dict.keys():
101 |                 if 'word_embeddings' in key:
102 |                     state_dict_[key.split('word_embeddings.')[1]] = state_dict[key]
103 |         self.word_embeddings.load_state_dict(state_dict_, strict=strict)
104 | 
105 |         # Position embedding.
106 |         if self._position_embeddings_key in state_dict:
107 |             state_dict_ = state_dict[self._position_embeddings_key]
108 |         else:
109 |             # for backward compatibility.
110 |             state_dict_ = {}
111 |             for key in state_dict.keys():
112 |                 if 'position_embeddings' in key:
113 |                     state_dict_[key.split('position_embeddings.')[1]] = state_dict[key]
114 |         self.position_embeddings.load_state_dict(state_dict_, strict=strict)
115 | 


--------------------------------------------------------------------------------
/src/megatron/core/package_info.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 2 | 
 3 | 
 4 | MAJOR = 0
 5 | MINOR = 2
 6 | PATCH = 0
 7 | PRE_RELEASE = ''
 8 | 
 9 | # Use the following formatting: (major, minor, patch, pre-release)
10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
11 | 
12 | __shortversion__ = '.'.join(map(str, VERSION[:3]))
13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])
14 | 
15 | __package_name__ = 'megatron_core'
16 | __contact_names__ = 'NVIDIA'
17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email
18 | __homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage
19 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
20 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
21 | __description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models'
22 | __license__ = 'BSD-3'
23 | __keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
24 | 


--------------------------------------------------------------------------------
/src/megatron/core/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | from .schedules import get_forward_backward_func
2 | 


--------------------------------------------------------------------------------
/src/megatron/core/requirements.txt:
--------------------------------------------------------------------------------
1 | pybind11
2 | torch
3 | regex


--------------------------------------------------------------------------------
/src/megatron/core/sequence_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | from .cross_entropy import vocab_sequence_parallel_cross_entropy
2 | 


--------------------------------------------------------------------------------
/src/megatron/core/sequence_parallel/cross_entropy.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from megatron.core.parallel_state import (
 4 |     get_sequence_parallel_group,
 5 |     get_sequence_parallel_rank,
 6 |     get_sequence_parallel_world_size
 7 | )
 8 | 
 9 | class _VocabSequenceParallelCrossEntropy(torch.autograd.Function):
10 | 
11 |     @staticmethod
12 |     def forward(ctx, vocab_seq_parallel_logits, target, label_smoothing=0.0):
13 |         # vocab_seq_parallel_logits: [S/P, B, V]
14 |         # target: [S/P, B]
15 |         # return: [S, B]
16 | 
17 |         # Need softmax for backward
18 |         softmax = torch.nn.functional.softmax(vocab_seq_parallel_logits, dim=-1)
19 |         ctx.vocab_size = vocab_seq_parallel_logits.size(2)
20 |         loss = torch.nn.functional.nll_loss(softmax.log().view(-1, ctx.vocab_size), target.view(-1), reduction='none')
21 |        
22 |         ctx.seqlen = vocab_seq_parallel_logits.size(0) * get_sequence_parallel_world_size()
23 |         batch_size = vocab_seq_parallel_logits.size(1)
24 | 
25 |         loss_all = torch.empty(ctx.seqlen, batch_size, dtype=vocab_seq_parallel_logits.dtype, device=vocab_seq_parallel_logits.device)
26 |         torch.distributed.all_gather_into_tensor(loss_all, loss, group=get_sequence_parallel_group())
27 | 
28 |         ctx.save_for_backward(softmax, target)
29 | 
30 |         return loss_all
31 | 
32 |     @staticmethod
33 |     def backward(ctx, grad_output):
34 |         softmax, target = ctx.saved_tensors
35 | 
36 |         step_seqlen = ctx.seqlen // get_sequence_parallel_world_size()
37 |         sp_rank = get_sequence_parallel_rank()
38 |         grad_output_part = grad_output[step_seqlen*sp_rank:step_seqlen*(sp_rank + 1), :]
39 | 
40 |         grad_input = softmax
41 |         grad_2d = grad_input.view(-1, ctx.vocab_size)
42 |         arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
43 |                                  device=grad_2d.device)
44 | 
45 |         grad_2d[arange_1d, target.view(-1)] -= 1
46 |         grad_input.mul_(grad_output_part.unsqueeze(dim=-1))
47 | 
48 |         return grad_input, None, None
49 | 
50 | 
51 | def vocab_sequence_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
52 |     return _VocabSequenceParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
53 | 


--------------------------------------------------------------------------------
/src/megatron/core/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
 1 | from .cross_entropy import vocab_parallel_cross_entropy
 2 | from .data import broadcast_data
 3 | 
 4 | from .layers import (
 5 |     ColumnParallelLinear,
 6 |     RowParallelLinear,
 7 |     VocabParallelEmbedding,
 8 |     set_tensor_model_parallel_attributes,
 9 |     set_defaults_if_not_set_tensor_model_parallel_attributes,
10 |     copy_tensor_model_parallel_attributes,
11 |     param_is_not_tensor_parallel_duplicate,
12 |     linear_with_grad_accumulation_and_async_allreduce
13 | 
14 | )
15 | 
16 | from .mappings import (
17 |     copy_to_tensor_model_parallel_region,
18 |     gather_from_tensor_model_parallel_region,
19 |     gather_from_sequence_parallel_region,
20 |     scatter_to_tensor_model_parallel_region,
21 |     scatter_to_sequence_parallel_region,
22 | )
23 | 
24 | from .random import (
25 |     checkpoint,
26 |     get_cuda_rng_tracker,
27 |     model_parallel_cuda_manual_seed,
28 |     init_checkpointed_activations_memory_buffer,
29 |     reset_checkpointed_activations_memory_buffer,
30 | )
31 | 
32 | from .utils import (
33 |     split_tensor_along_last_dim,
34 |     split_tensor_into_1d_equal_chunks,
35 |     gather_split_1d_tensor,
36 | )
37 | 
38 | __all__ = [
39 |     # cross_entropy.py
40 |     "vocab_parallel_cross_entropy",
41 |     # data.py
42 |     "broadcast_data",
43 |     #layers.py
44 |     "ColumnParallelLinear",
45 |     "RowParallelLinear",
46 |     "VocabParallelEmbedding",
47 |     "set_tensor_model_parallel_attributes",
48 |     "set_defaults_if_not_set_tensor_model_parallel_attributes",
49 |     "copy_tensor_model_parallel_attributes",
50 |     "param_is_not_tensor_parallel_duplicate",
51 |     "linear_with_grad_accumulation_and_async_allreduce",
52 |     # mappings.py
53 |     "copy_to_tensor_model_parallel_region",
54 |     "gather_from_tensor_model_parallel_region",
55 |     "gather_from_sequence_parallel_region",
56 | #    "reduce_from_tensor_model_parallel_region",
57 |     "scatter_to_tensor_model_parallel_region",
58 |     "scatter_to_sequence_parallel_region",
59 |     # random.py
60 |     "checkpoint",
61 |     "get_cuda_rng_tracker",
62 |     "model_parallel_cuda_manual_seed",
63 |     "init_checkpointed_activations_memory_buffer",
64 |     "reset_checkpointed_activations_memory_buffer",
65 |     # utils.py
66 |     "split_tensor_along_last_dim",
67 |     "split_tensor_into_1d_equal_chunks",
68 |     "gather_split_1d_tensor",
69 | ]
70 | 


--------------------------------------------------------------------------------
/src/megatron/core/tensor_parallel/data.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import torch
  4 | 
  5 | from megatron.core.parallel_state import (
  6 |     get_tensor_model_parallel_group,
  7 |     get_tensor_model_parallel_rank,
  8 |     get_tensor_model_parallel_src_rank,
  9 |     get_sequence_parallel_group,
 10 |     get_sequence_parallel_world_size,
 11 |     get_sequence_parallel_rank,
 12 |     get_sequence_parallel_src_rank,
 13 | )
 14 | from deepspeed.accelerator import get_accelerator
 15 | 
 16 | _MAX_DATA_DIM = 5
 17 | 
 18 | 
 19 | def _check_data_types(keys, data, target_dtype):
 20 |     """Check that all the keys have the same target data type."""
 21 |     for key in keys:
 22 |         assert data[key].dtype == target_dtype, '{} has data type {} which '\
 23 |             'is different than {}'.format(key, data[key].dtype, target_dtype)
 24 | 
 25 | 
 26 | def _build_key_size_numel_dictionaries(keys, data, group=None, rank=-1, src_rank=-1):
 27 |     if group is None:
 28 |         group = get_tensor_model_parallel_group()
 29 |     if src_rank < 0:
 30 |         src_rank = get_tensor_model_parallel_src_rank()
 31 |     if rank < 0:
 32 |         rank = get_tensor_model_parallel_rank()
 33 |                     
 34 |     """Build the size on rank 0 and broadcast."""
 35 |     max_dim = _MAX_DATA_DIM
 36 |     sizes = [0 for _ in range(max_dim) for _ in keys]
 37 | 
 38 |     # Pack the sizes on rank zero.
 39 |     if rank == 0:
 40 |         offset = 0
 41 |         for key in keys:
 42 |             assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
 43 |             size = data[key].size()
 44 |             for i, s in enumerate(size):
 45 |                 sizes[i + offset] = s
 46 |             offset += max_dim
 47 | 
 48 |     # Move to GPU and broadcast.
 49 |     sizes_cuda = get_accelerator().LongTensor(sizes)
 50 |     torch.distributed.broadcast(sizes_cuda, src_rank, group=group)
 51 | 
 52 |     # Move back to cpu and unpack.
 53 |     sizes_cpu = sizes_cuda.cpu()
 54 |     key_size = {}
 55 |     key_numel = {}
 56 |     total_numel = 0
 57 |     offset = 0
 58 |     for key in keys:
 59 |         i = 0
 60 |         size = []
 61 |         numel = 1
 62 |         while sizes_cpu[offset + i] > 0:
 63 |             this_size = sizes_cpu[offset + i]
 64 |             size.append(this_size)
 65 |             numel *= this_size
 66 |             i += 1
 67 |         key_size[key] = size
 68 |         key_numel[key] = numel
 69 |         total_numel += numel
 70 |         offset += max_dim
 71 | 
 72 |     return key_size, key_numel, total_numel
 73 | 
 74 | 
 75 | def broadcast_data(keys, data, datatype):
 76 |     """Broadcast data from rank zero of each model parallel group to the
 77 |     members of the same model parallel group.
 78 | 
 79 |     Arguments:
 80 |         keys: list of keys in the data disctionary to be broadcasted
 81 |         data: data dictionary of string keys and cpu tensor values.
 82 |         datatype: torch data type of all tensors in data associated
 83 |                   with keys.
 84 |     """
 85 |     # Build (key, size) and (key, number of elements) dictionaries along
 86 |     # with the total number of elements on all ranks.
 87 |     if get_sequence_parallel_world_size() > 1:
 88 |         rank = get_sequence_parallel_rank()
 89 |         src_rank = get_sequence_parallel_src_rank()
 90 |         group = get_sequence_parallel_group()
 91 |     else:
 92 |         rank = get_tensor_model_parallel_rank()
 93 |         src_rank = get_tensor_model_parallel_src_rank()
 94 |         group = get_tensor_model_parallel_group()
 95 | 
 96 |     key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
 97 |         keys, data, group=group, rank=rank, src_rank=src_rank)
 98 | 
 99 |     # Pack on rank zero.
100 |     if rank == 0:
101 |         # Check that all keys have the same data type.
102 |         _check_data_types(keys, data, datatype)
103 |         # Flatten the data associated with the keys
104 |         flatten_data = torch.cat(
105 |             [data[key].contiguous().view(-1) for key in keys], dim=0).to(get_accelerator().device_name())
106 |     else:
107 |         flatten_data = torch.empty(total_numel,
108 |                                    device=get_accelerator().current_device_name(),
109 |                                    dtype=datatype)
110 | 
111 |     # Broadcast
112 |     torch.distributed.broadcast(flatten_data, src_rank, group=group)
113 | 
114 |     # Unpack
115 |     output = {}
116 |     offset = 0
117 |     for key in keys:
118 |         size = key_size[key]
119 |         numel = key_numel[key]
120 |         output[key] = flatten_data.narrow(0, offset, numel).view(size)
121 |         offset += numel
122 | 
123 |     return output
124 | 


--------------------------------------------------------------------------------
/src/megatron/core/tensor_parallel/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import torch
  4 | from typing import List, Sequence
  5 | 
  6 | from megatron.core.utils import divide
  7 | from megatron.core import parallel_state
  8 | 
  9 | def split_tensor_along_last_dim(
 10 |     tensor: torch.Tensor,
 11 |     num_partitions: int,
 12 |     contiguous_split_chunks: bool = False,
 13 | ) -> List[torch.Tensor]:
 14 |     """ Split a tensor along its last dimension.
 15 | 
 16 |         Arguments:
 17 |             tensor: input tensor.
 18 |             num_partitions: number of partitions to split the tensor
 19 |             contiguous_split_chunks: If True, make each chunk contiguous
 20 |                                      in memory.
 21 | 
 22 |         Returns:
 23 |             A list of Tensors
 24 |     """
 25 |     # Get the size and dimension.
 26 |     last_dim = tensor.dim() - 1
 27 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
 28 |     # Split.
 29 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
 30 |     # Note: torch.split does not create contiguous tensors by default.
 31 |     if contiguous_split_chunks:
 32 |         return tuple(chunk.contiguous() for chunk in tensor_list)
 33 | 
 34 |     return tensor_list
 35 | 
 36 | def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
 37 |     """ Break a tensor into equal 1D chunks across tensor parallel ranks.
 38 | 
 39 |         Returns a Tensor or View with this rank's portion of the data.
 40 | 
 41 |         Arguments:
 42 |             tensor: The tensor to split
 43 | 
 44 |         Keyword Arguments:
 45 |             new_buffer (bool): If True, returns a new Tensor.
 46 |                                If False, returns a view into the existing Tensor.
 47 |                                Default is False
 48 | 
 49 |     """
 50 |     partition_size = torch.numel(tensor) // \
 51 |         parallel_state.get_tensor_model_parallel_world_size()
 52 |     start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
 53 |     end_index = start_index + partition_size
 54 |     if new_buffer:
 55 |         data = torch.empty(partition_size, dtype=tensor.dtype,
 56 |                            device=torch.cuda.current_device(),
 57 |                            requires_grad=False)
 58 |         data.copy_(tensor.view(-1)[start_index:end_index])
 59 |     else:
 60 |         data = tensor.view(-1)[start_index:end_index]
 61 |     return data
 62 | 
 63 | 
 64 | def gather_split_1d_tensor(tensor):
 65 |     """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
 66 |         model parallel ranks.
 67 | 
 68 |         Returns a new Tensor with the gathered data.
 69 | 
 70 |         Arguments:
 71 |             tensor: A Tensor or view of this rank's portion of the data.
 72 |     """
 73 |     numel_gathered = torch.numel(tensor) * \
 74 |         parallel_state.get_tensor_model_parallel_world_size()
 75 |     gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
 76 |                            device=torch.cuda.current_device(),
 77 |                            requires_grad=False)
 78 |     # TODO: This API is experimental in pytorch (as of Feb 2022) and
 79 |     # this might break in future pytorch releases. We chose this API
 80 |     # as opposed to torch.distributed.all_gather for efficiency reasons.
 81 |     # This API calls directly NCCL all-gather versus the former does
 82 |     # internal copies and can potentially cause slow down.
 83 |     torch.distributed._all_gather_base(gathered, tensor,
 84 |                                        group=parallel_state.get_tensor_model_parallel_group())
 85 |     return gathered
 86 | 
 87 | 
 88 | class VocabUtility:
 89 |     """ Split the vocabulary into `world_size` chunks and return the first
 90 |         and last index of the vocabulary belonging to the `rank`
 91 |         partition: Note that indices in [fist, last)
 92 | 
 93 |     """
 94 | 
 95 |     @staticmethod
 96 |     def vocab_range_from_per_partition_vocab_size(
 97 |         per_partition_vocab_size: int, rank, world_size: int
 98 |     ) -> Sequence[int]:
 99 |         index_f = rank * per_partition_vocab_size
100 |         index_l = index_f + per_partition_vocab_size
101 |         return index_f, index_l
102 | 
103 |     @staticmethod
104 |     def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
105 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
106 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
107 |             per_partition_vocab_size, rank, world_size
108 |         )
109 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | from .transformer_config import TransformerConfig
4 | from .core_attention import CoreAttention
5 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/custom_layers/transformer_engine.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import transformer_engine as te
  3 | from typing import Callable
  4 | 
  5 | from megatron.core.transformer.transformer_config import TransformerConfig
  6 | from megatron.core.transformer.enums import AttnMaskType
  7 | from megatron.core.parallel_state import get_tensor_model_parallel_group
  8 | from megatron.core.tensor_parallel import get_cuda_rng_tracker
  9 | 
 10 | class TELayerNorm(te.pytorch.module.LayerNorm):
 11 |     """
 12 |     Wrapper for the Transformer-Engine's `LayerNorm`.
 13 |     """
 14 |     def __init__(self,
 15 |                  hidden_size: int,
 16 |                  eps: float = 1e-5,
 17 |                  sequence_parallel: bool = False,
 18 |                  **kwargs):
 19 |         super().__init__(
 20 |             hidden_size=hidden_size,
 21 |             eps=eps,
 22 |             sequence_parallel=sequence_parallel
 23 |         )
 24 | 
 25 | class TELinear(te.pytorch.module.Linear):
 26 |     """
 27 |     Wrapper for the Transformer-Engine's `Linear` layer.
 28 | 
 29 |     Note that if Megatron's parallel_state has not been initialized
 30 |     yet, the tp_group passed to TE will be None and must be set later
 31 |     via set_tensor_parallel_group().
 32 |     """
 33 |     def __init__(self,
 34 |                  input_size: int,
 35 |                  output_size: int,
 36 |                  config: TransformerConfig,
 37 |                  parallel_mode: str,
 38 |                  init_method: Callable, *,
 39 |                  bias: bool = True,
 40 |                  skip_bias_add: bool = False,
 41 |                  **kwargs):
 42 |         self.config = config
 43 | 
 44 |         # TE returns a zero length Tensor when bias=False and
 45 |         # return_bias=True, but we prefer None.  So in that case we
 46 |         # tell TE to not return the bias, and return None
 47 |         # ourselves. This way our forward always returns two values
 48 |         # and we don't have to deal with the zero length Tensor.
 49 |         self.te_return_bias = skip_bias_add and bias
 50 | 
 51 |         super().__init__(
 52 |             in_features=input_size,
 53 |             out_features=output_size,
 54 |             sequence_parallel=self.config.sequence_parallel,
 55 |             fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
 56 |             tp_group=get_tensor_model_parallel_group(check_initialized=False),
 57 |             tp_size=self.config.tensor_model_parallel_size,
 58 |             get_rng_state_tracker=get_cuda_rng_tracker,
 59 |             init_method=init_method,
 60 |             params_dtype=self.config.params_dtype,
 61 |             parallel_mode=parallel_mode,
 62 |             bias=bias,
 63 |             return_bias=self.te_return_bias,
 64 |             **kwargs
 65 |         )
 66 | 
 67 |     def forward(self, x):
 68 |         out = super().forward(x)
 69 | 
 70 |         # TE only returns a tuple when return_bias is True, otherwise
 71 |         # it returns a single Tensor, we always want to return two
 72 |         # values regardless of the arguments.
 73 |         if self.te_return_bias:
 74 |             return out
 75 |         return out, None
 76 | 
 77 | class TEColumnParallelLinear(TELinear):
 78 |     """
 79 |     Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
 80 |     to megatron's `ColumnParallelLinear` layer.
 81 |     """
 82 |     def __init__(self,
 83 |                  input_size: int,
 84 |                  output_size: int,
 85 |                  config: TransformerConfig,
 86 |                  **kwargs):
 87 |         self.config = config
 88 |         super().__init__(
 89 |             input_size=input_size,
 90 |             output_size=output_size,
 91 |             config=self.config,
 92 |             parallel_mode="column",
 93 |             **kwargs
 94 |         )
 95 | 
 96 | class TERowParallelLinear(TELinear):
 97 |     """
 98 |     Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
 99 |     to megatron's `RowParallelLinear` layer.
100 |     """
101 |     def __init__(self,
102 |                  input_size: int,
103 |                  output_size: int,
104 |                  config: TransformerConfig,
105 |                  **kwargs):
106 |         self.config = config
107 |         super().__init__(
108 |             input_size=input_size,
109 |             output_size=output_size,
110 |             config=self.config,
111 |             parallel_mode="row",
112 |             **kwargs
113 |         )
114 | 
115 | class TECoreAttention(te.pytorch.transformer.DotProductAttention):
116 |     """
117 |     Wrapper for the Transformer-Engine's `DotProductAttention` layer that also
118 |     has "flash attention" enabled.
119 | 
120 |     Note that if Megatron's parallel_state has not been initialized
121 |     yet, the tp_group passed to TE will be None and must be set later
122 |     via set_tensor_parallel_group().
123 |     """
124 |     def __init__(self,
125 |                  config: TransformerConfig,
126 |                  layer_number: int = 1,
127 |                  attn_mask_type: AttnMaskType = AttnMaskType.padding,
128 |                  **kwargs):
129 |         self.config = config
130 |         super().__init__(
131 |             num_attention_heads=self.config.num_attention_heads,
132 |             kv_channels=self.config.kv_channels,
133 |             attention_dropout=self.config.attention_dropout,
134 |             layer_number=layer_number,
135 |             attn_mask_type=attn_mask_type.name,
136 |             sequence_parallel=self.config.sequence_parallel,
137 |             tp_size=self.config.tensor_model_parallel_size,
138 |             get_rng_state_tracker=get_cuda_rng_tracker,
139 |             tp_group=get_tensor_model_parallel_group(check_initialized=False),
140 |             **kwargs
141 |         )
142 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | 
 6 | # can we get rid of this?
 7 | # it's being used in pipeline schedules
 8 | class ModelType(enum.Enum):
 9 |     encoder_or_decoder = 1
10 |     encoder_and_decoder = 2
11 | 
12 | 
13 | # class LayerType(enum.Enum):
14 | #     encoder = 1
15 | #     decoder = 2
16 | 
17 | 
18 | class AttnType(enum.Enum):
19 |     self_attn = 1
20 |     cross_attn = 2
21 | 
22 | 
23 | class AttnMaskType(enum.Enum):
24 |     padding = 1
25 |     causal = 2
26 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/mlp.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | import torch.nn.functional as F
 5 | 
 6 | from megatron.core import tensor_parallel
 7 | from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
 8 | from megatron.core.transformer.module import MegatronModule
 9 | from megatron.core.transformer.transformer_config import TransformerConfig
10 | from megatron.core.transformer.custom_layers.transformer_engine import \
11 |         TERowParallelLinear, TEColumnParallelLinear
12 | 
13 | class MLP(MegatronModule):
14 |     """
15 |     MLP will take the input with h hidden state, project it to 4*h
16 |     hidden dimension, perform nonlinear transformation, and project the
17 |     state back into h hidden dimension.
18 | 
19 | 
20 |     Returns an output and a bias to be added to the output.
21 |     If config.add_bias_linear is False, the bias returned is None.
22 | 
23 |     We use the following notation:
24 |      h: hidden size
25 |      p: number of tensor model parallel partitions
26 |      b: batch size
27 |      s: sequence length
28 |     """
29 | 
30 |     def __init__(self, config: TransformerConfig):
31 |         super().__init__(config=config)
32 | 
33 |         self.config: TransformerConfig = config
34 | 
35 |         # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
36 |         ffn_hidden_size = self.config.ffn_hidden_size
37 |         if self.config.gated_linear_unit:
38 |             ffn_hidden_size *= 2
39 | 
40 |         self.linear_fc1 = TEColumnParallelLinear(
41 |             self.config.hidden_size,
42 |             ffn_hidden_size,
43 |             config=self.config,
44 |             init_method=self.config.init_method,
45 |             bias=self.config.add_bias_linear,
46 |             skip_bias_add=True,
47 |         )
48 | 
49 |         if self.config.gated_linear_unit:
50 |             def glu(x):
51 |                 x = torch.chunk(x, 2, dim=-1)
52 |                 return self.config.activation_func(x[0]) * x[1]
53 |             self.activation_func = glu
54 |         else:
55 |             self.activation_func = self.config.activation_func
56 | 
57 |         self.linear_fc2 = TERowParallelLinear(
58 |             self.config.ffn_hidden_size,
59 |             self.config.hidden_size,
60 |             config=self.config,
61 |             init_method=self.config.output_layer_init_method,
62 |             bias=self.config.add_bias_linear,
63 |             skip_bias_add=True,
64 |         )
65 | 
66 |     def forward(self, hidden_states):
67 | 
68 |         # [s, b, 4 * h/p]
69 |         intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
70 | 
71 |         if self.config.bias_gelu_fusion:
72 |             assert self.config.add_bias_linear is True
73 |             assert self.activation_func == F.gelu
74 |             intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
75 |         else:
76 |             if bias_parallel is not None:
77 |                 intermediate_parallel = intermediate_parallel + bias_parallel
78 |             intermediate_parallel = self.activation_func(intermediate_parallel)
79 | 
80 |         # [s, b, h]
81 |         output, output_bias = self.linear_fc2(intermediate_parallel)
82 |         return output, output_bias
83 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/module.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Megatron Module"""
  4 | 
  5 | import torch
  6 | from torch.autograd import Variable
  7 | from torch.nn.parameter import Parameter
  8 | 
  9 | from megatron.core import parallel_state, tensor_parallel
 10 | from megatron.core.transformer.transformer_config import TransformerConfig
 11 | 
 12 | 
 13 | _FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
 14 | _HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
 15 | _BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
 16 | 
 17 | 
 18 | def param_is_not_shared(param):
 19 |     return not hasattr(param, 'shared') or not param.shared
 20 | 
 21 | 
 22 | class MegatronModule(torch.nn.Module):
 23 |     """Megatron specific extensions of torch Module with support
 24 |     for pipelining."""
 25 | 
 26 |     # def __init__(self, config: TransformerConfig, share_word_embeddings=True):
 27 |     def __init__(self, config: TransformerConfig):
 28 |         super().__init__()
 29 |         self.config = config
 30 | 
 31 |     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
 32 |         """Use this function to override the state dict for
 33 |         saving checkpoints."""
 34 |         return self.state_dict(prefix=prefix, keep_vars=keep_vars)
 35 | 
 36 | 
 37 | def conversion_helper(val, conversion):
 38 |     """Apply conversion to val. Recursively apply conversion if `val`
 39 |     #is a nested tuple/list structure."""
 40 |     if not isinstance(val, (tuple, list)):
 41 |         return conversion(val)
 42 |     rtn = [conversion_helper(v, conversion) for v in val]
 43 |     if isinstance(val, tuple):
 44 |         rtn = tuple(rtn)
 45 |     return rtn
 46 | 
 47 | 
 48 | def fp32_to_float16(val, float16_convertor):
 49 |     """Convert fp32 `val` to fp16/bf16"""
 50 | 
 51 |     def half_conversion(val):
 52 |         val_typecheck = val
 53 |         if isinstance(val_typecheck, (Parameter, Variable)):
 54 |             val_typecheck = val.data
 55 |         if isinstance(val_typecheck, _FLOAT_TYPES):
 56 |             val = float16_convertor(val)
 57 |         return val
 58 | 
 59 |     return conversion_helper(val, half_conversion)
 60 | 
 61 | 
 62 | def float16_to_fp32(val):
 63 |     """Convert fp16/bf16 `val` to fp32"""
 64 | 
 65 |     def float_conversion(val):
 66 |         val_typecheck = val
 67 |         if isinstance(val_typecheck, (Parameter, Variable)):
 68 |             val_typecheck = val.data
 69 |         if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
 70 |             val = val.float()
 71 |         return val
 72 | 
 73 |     return conversion_helper(val, float_conversion)
 74 | 
 75 | 
 76 | class Float16Module(MegatronModule):
 77 |     def __init__(self, config: TransformerConfig, module: torch.nn.Module):
 78 |         super(Float16Module, self).__init__(config)
 79 |         self.config = config
 80 |         self.fp16 = config.fp16
 81 |         self.bf16 = config.bf16
 82 | 
 83 |         if self.fp16:
 84 |             self.add_module('module', module.half())
 85 | 
 86 |             def float16_convertor(val):
 87 |                 return val.half()
 88 | 
 89 |         elif self.bf16:
 90 |             self.add_module('module', module.bfloat16())
 91 | 
 92 |             def float16_convertor(val):
 93 |                 return val.bfloat16()
 94 | 
 95 |         else:
 96 |             raise Exception('Either config.fp16 or config.bf16 should be True.')
 97 | 
 98 |         self.float16_convertor = float16_convertor
 99 | 
100 |     def set_input_tensor(self, input_tensor):
101 |         return self.module.set_input_tensor(input_tensor)
102 | 
103 |     def forward(self, *inputs, **kwargs):
104 |         if parallel_state.is_pipeline_first_stage():
105 |             inputs = fp32_to_float16(inputs, self.float16_convertor)
106 |         outputs = self.module(*inputs, **kwargs)
107 |         if parallel_state.is_pipeline_last_stage():
108 |             outputs = float16_to_fp32(outputs)
109 |         return outputs
110 | 
111 |     def state_dict(self, destination=None, prefix='', keep_vars=False):
112 |         return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
113 | 
114 |     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
115 |         return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
116 | 
117 |     def load_state_dict(self, state_dict, strict=True):
118 |         self.module.load_state_dict(state_dict, strict=strict)
119 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/transformer_layer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import torch
  4 | 
  5 | from megatron.core.transformer.module import MegatronModule
  6 | from megatron.core.transformer.transformer_config import TransformerConfig
  7 | from megatron.core.transformer.enums import AttnType, AttnMaskType
  8 | from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
  9 | from megatron.core.transformer.attention import SelfAttention
 10 | from megatron.core.transformer.mlp import MLP
 11 | from megatron.core.utils import make_viewless_tensor
 12 | from megatron.core.transformer.custom_layers.transformer_engine import TELayerNorm
 13 | 
 14 | class TransformerLayer(MegatronModule):
 15 |     """A single transformer layer.
 16 | 
 17 |     Transformer layer takes input with size [s, b, h] and returns an
 18 |     output of the same size.
 19 |     """
 20 | 
 21 |     def __init__(
 22 |         self, config: TransformerConfig, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding,
 23 |     ):
 24 |         super().__init__(config=config)
 25 |         self.config: TransformerConfig = config
 26 | 
 27 |         self.layer_number = layer_number
 28 |         self.self_attn_mask_type = self_attn_mask_type
 29 | 
 30 |         # Layernorm on the input data.
 31 |         # TODO: add pytorch only layernorm
 32 |         self.input_layernorm = TELayerNorm(
 33 |             hidden_size=self.config.hidden_size,
 34 |             eps=self.config.layernorm_epsilon,
 35 |             persist_layer_norm=self.config.persist_layer_norm,
 36 |             sequence_parallel=self.config.sequence_parallel,
 37 |             zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
 38 |         )
 39 | 
 40 |         # Self attention.
 41 |         self.self_attention = SelfAttention(
 42 |             config=self.config,
 43 |             layer_number=layer_number,
 44 |             attn_mask_type=self_attn_mask_type,
 45 |         )
 46 | 
 47 |         # Layernorm on the attention output
 48 |         self.post_self_attn_layernorm = TELayerNorm(
 49 |             hidden_size=self.config.hidden_size,
 50 |             eps=self.config.layernorm_epsilon,
 51 |             persist_layer_norm=self.config.persist_layer_norm,
 52 |             sequence_parallel=self.config.sequence_parallel,
 53 |             zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
 54 |         )
 55 | 
 56 |         # MLP
 57 |         self.mlp = MLP(config=self.config)
 58 | 
 59 |         # @jcasper how should we handle nvfuser?
 60 |         # Set bias+dropout+add fusion grad_enable execution handler.
 61 |         # TORCH_MAJOR = int(torch.__version__.split('.')[0])
 62 |         # TORCH_MINOR = int(torch.__version__.split('.')[1])
 63 |         # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
 64 |         # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
 65 |         self.bias_dropout_add_exec_handler = torch.enable_grad
 66 | 
 67 |         self.bias_dropout_add_func = get_bias_dropout_add(
 68 |             self.training,
 69 |             self.config.bias_dropout_fusion
 70 |         )
 71 | 
 72 |     # TODO: decide how to do inference_params
 73 |     def forward(
 74 |         self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None
 75 |     ):
 76 |         # hidden_states: [s, b, h]
 77 | 
 78 |         # Layer norm at the beginning of the transformer layer.
 79 |         layernorm_output = self.input_layernorm(hidden_states)
 80 |         # Self attention.
 81 |         attention_output_with_bias = self.self_attention(
 82 |             layernorm_output, attention_mask, inference_params=inference_params
 83 |         )
 84 | 
 85 |         # Residual connection.
 86 |         if self.config.apply_residual_connection_post_layernorm:
 87 |             residual = layernorm_output
 88 |         else:
 89 |             residual = hidden_states
 90 | 
 91 |         # bias_dropout_add fusion returning fp32 instead of bf16
 92 |         with self.bias_dropout_add_exec_handler():
 93 |             layernorm_input = self.bias_dropout_add_func(
 94 |                 attention_output_with_bias, residual, self.config.hidden_dropout
 95 |             )
 96 | 
 97 |         # Layer norm post the self attention.
 98 |         layernorm_output = self.post_self_attn_layernorm(layernorm_input)
 99 | 
100 |         # MLP.
101 |         mlp_output_with_bias = self.mlp(layernorm_output)
102 | 
103 |         # Second residual connection.
104 |         if self.config.apply_residual_connection_post_layernorm:
105 |             residual = layernorm_output
106 |         else:
107 |             residual = layernorm_input
108 | 
109 |         with self.bias_dropout_add_exec_handler():
110 |             output = self.bias_dropout_add_func(
111 |                 mlp_output_with_bias, residual, self.config.hidden_dropout
112 |             )
113 | 
114 |         # Jit compiled function creates 'view' tensor. This tensor
115 |         # potentially gets saved in the MPU checkpoint function context,
116 |         # which rejects view tensors. While making a viewless tensor here
117 |         # won't result in memory savings (like the data loader, or
118 |         # p2p_communication), it serves to document the origin of this
119 |         # 'view' tensor.
120 |         output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True)
121 | 
122 |         return output
123 | 


--------------------------------------------------------------------------------
/src/megatron/core/transformer/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Utilities for transformer layers."""
 4 | 
 5 | import torch
 6 | 
 7 | from megatron import get_args
 8 | 
 9 | from deepspeed.runtime.zero import GatheredParameters
10 | 
11 | def attention_mask_func(attention_scores, attention_mask):
12 |     attention_scores.masked_fill_(attention_mask, -10000.0)
13 |     return attention_scores
14 | 
15 | 
16 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
17 |     """Simple linear layer with weight initialization."""
18 |     layer = torch.nn.Linear(rows, columns)
19 |     if get_args().perform_initialization:
20 |         with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
21 |             init_method(layer.weight)
22 |     with torch.no_grad():
23 |         with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
24 |             layer.bias.zero_()
25 |     return layer
26 | 
27 | 
28 | @torch.jit.script
29 | def gelu_impl(x):
30 |     """OpenAI's gelu implementation."""
31 |     return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
32 | 
33 | 
34 | def openai_gelu(x):
35 |     return gelu_impl(x)
36 | 
37 | 
38 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
39 | @torch.jit.script
40 | def erf_gelu(x):
41 |     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
42 | 


--------------------------------------------------------------------------------
/src/megatron/data/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
 3 | LIBNAME = helpers
 4 | LIBEXT = $(shell python3-config --extension-suffix)
 5 | 
 6 | default: $(LIBNAME)$(LIBEXT)
 7 | 
 8 | %$(LIBEXT): %.cpp
 9 | 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 | 


--------------------------------------------------------------------------------
/src/megatron/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import indexed_dataset
2 | 


--------------------------------------------------------------------------------
/src/megatron/data/blendable_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Blendable dataset."""
  4 | 
  5 | import hashlib
  6 | import os
  7 | import time
  8 | 
  9 | import numpy as np
 10 | import torch
 11 | 
 12 | from megatron import print_rank_0
 13 | from megatron.core import mpu
 14 | 
 15 | class BlendableDataset(torch.utils.data.Dataset):
 16 | 
 17 | 
 18 |     def __init__(self, datasets, weights, size, *,
 19 |                  data_cache_path=None):
 20 | 
 21 |         self.datasets = datasets
 22 |         num_datasets = len(datasets)
 23 |         assert num_datasets == len(weights)
 24 | 
 25 |         self.size = size
 26 | 
 27 |         # Normalize weights.
 28 |         weights = np.array(weights, dtype=np.float64)
 29 |         sum_weights = np.sum(weights)
 30 |         assert sum_weights > 0.0
 31 |         weights /= sum_weights
 32 | 
 33 |         # Build indicies.
 34 |         def _build_indices():
 35 |             start_time = time.time()
 36 |             assert num_datasets < 255
 37 |             dataset_index = np.zeros(self.size, dtype=np.uint8)
 38 |             dataset_sample_index = np.zeros(self.size, dtype=np.int64)
 39 | 
 40 |             from megatron.data import helpers
 41 |             helpers.build_blending_indices(dataset_index, dataset_sample_index,
 42 |                                            weights, num_datasets, self.size,
 43 |                                            torch.distributed.get_rank() == 0)
 44 |             print_rank_0('> elapsed time for building blendable dataset indices: '
 45 |                          '{:.2f} (sec)'.format(time.time() - start_time))
 46 |             return dataset_index, dataset_sample_index
 47 | 
 48 |         desc = "Blendable dataset\n\n"
 49 |         desc += "Datasets:\n"
 50 |         for dataset in datasets:
 51 |             desc += dataset.desc + "\n\n"
 52 |         desc += f"Weights: {weights}\n"
 53 |         desc += f"Size: {size}\n"
 54 |         self.desc = desc
 55 | 
 56 |         if data_cache_path:
 57 |             desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
 58 |             desc_path = os.path.join(data_cache_path, desc_hash + ".dsc")
 59 |             index_path = os.path.join(data_cache_path, desc_hash + "_index.npy")
 60 |             sample_index_path = os.path.join(data_cache_path, desc_hash + "_sample_index.npy")
 61 |             cache_hit = os.path.isfile(index_path) and os.path.isfile(sample_index_path)
 62 |             cache_success = True
 63 |             if torch.distributed.get_rank() == 0 and not cache_hit:
 64 |                 print(' > WARNING: could not find index map files for blendable'
 65 |                       ' dataset, building indices on rank 0 ...', flush=True)
 66 |                 dataset_index, dataset_sample_index = _build_indices()
 67 |                 try:
 68 |                     os.makedirs(os.path.dirname(index_path), exist_ok=True)
 69 |                     with open(desc_path, 'wt') as fd:
 70 |                         fd.write(desc)
 71 |                         np.save(index_path, dataset_index, allow_pickle=True)
 72 |                         np.save(sample_index_path, dataset_sample_index,
 73 |                                 allow_pickle=True)
 74 |                 except OSError:
 75 |                     print(f'There was an error trying to create the data cache directory ({data_cache_path})')
 76 |                     print('or a file in it. This is set with the --data-cache-path argument. Please')
 77 |                     print('ensure you have write access to this directory or specify one that you do have')
 78 |                     print('write access to.')
 79 |                     cache_success = False
 80 | 
 81 | 
 82 |             counts = torch.cuda.LongTensor([cache_success])
 83 |             torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
 84 |             torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
 85 |             if counts[0].item() != (
 86 |                 torch.distributed.get_world_size() //
 87 |                 torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) //
 88 |                 torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())):
 89 |                 print_rank_0("Data index creation unsuccessful, exiting.")
 90 |                 exit()
 91 | 
 92 |             # Load on all ranks.
 93 |             print_rank_0(f'> loading blendable dataset index: {index_path}')
 94 |             self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r')
 95 |             assert self.dataset_index.size == self.size
 96 | 
 97 |             print_rank_0(f'> loading blendable dataset sample index: {sample_index_path}')
 98 |             self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r')
 99 |             assert self.dataset_sample_index.size == self.size
100 |         else:
101 |             self.dataset_index, self.dataset_sample_index = _build_indices()
102 | 
103 | 
104 |         # Check size
105 |         _ = self.__getitem__(self.size - 1)
106 |         try:
107 |             _ = self.__getitem__(self.size)
108 |             raise RuntimeError('BlendedDataset size is improperly bounded')
109 |         except IndexError:
110 |             pass
111 |         print_rank_0('> size of blendable dataset: '
112 |                      '{} samples'.format(self.size))
113 | 
114 | 
115 |     def __len__(self):
116 |         return self.size
117 | 
118 | 
119 |     def __getitem__(self, idx):
120 |         dataset_idx = self.dataset_index[idx]
121 |         sample_idx = self.dataset_sample_index[idx]
122 |         return {
123 |             "dataset_idx" : dataset_idx,
124 |             **self.datasets[dataset_idx][sample_idx],
125 |         }
126 | 


--------------------------------------------------------------------------------
/src/megatron/data/test/test_indexed_dataset.py:
--------------------------------------------------------------------------------
  1 | # This file isn't really a formal automated test, it's just a place to
  2 | # put some code used during development and manual testing of
  3 | # indexed_dataset.
  4 | 
  5 | from megatron.data import indexed_dataset
  6 | from megatron.tokenizer import build_tokenizer
  7 | import argparse
  8 | import os
  9 | import sys
 10 | 
 11 | import torch
 12 | 
 13 | script_dir = os.path.dirname(os.path.realpath(__file__))
 14 | sys.path.append(os.path.join(script_dir, "../../../"))
 15 | 
 16 | 
 17 | def test_indexed_dataset(args):
 18 |     ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
 19 |     tokenizer = build_tokenizer(args)
 20 |     print(len(ds.doc_idx))
 21 |     print(len(ds))
 22 |     print(ds.doc_idx[-1])
 23 |     if ds.supports_prefetch:
 24 |         # just prefetch the whole thing in test (so assume it is small)
 25 |         ds.prefetch(range(len(ds)))
 26 |     if args.count > len(ds.doc_idx) - 1:
 27 |         args.count = len(ds.doc_idx) - 1
 28 | 
 29 |     for i in range(args.count):
 30 |         start = ds.doc_idx[i]
 31 |         end = ds.doc_idx[i + 1]
 32 |         ids = ds[start:end]
 33 |         print(f"Document {i}:")
 34 |         print("--------------")
 35 |         for s in ids:
 36 |             assert len(s) > 0
 37 |             l = s.data.tolist()
 38 |             text = tokenizer.detokenize(l)
 39 |             print(text)
 40 |             print("---")
 41 | 
 42 | 
 43 | def test_indexed_dataset_get(args):
 44 |     ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
 45 |     tokenizer = build_tokenizer(args)
 46 |     size = ds.sizes[0]
 47 |     print(f"size: {size}")
 48 |     full = ds.get(0)
 49 |     print(full)
 50 |     # print(tokenizer.detokenize(full.data.tolist()))
 51 |     print("---")
 52 |     end = ds.get(0, offset=size - 10)
 53 |     print(end)
 54 |     # print(tokenizer.detokenize(end.data.tolist()))
 55 | 
 56 |     start = ds.get(0, length=10)
 57 |     print(start)
 58 |     # print(tokenizer.detokenize(start.data.tolist()))
 59 | 
 60 |     part = ds.get(0, offset=2, length=8)
 61 |     print(part)
 62 |     # print(tokenizer.detokenize(part.data.tolist()))
 63 | 
 64 | # def test_albert_dataset(args):
 65 | #     # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
 66 | #     # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
 67 | #     # ds = AlbertDataset(idataset, tokenizer)
 68 | #     ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
 69 | #                                   args.epochs, args.max_num_samples,
 70 | #                                   args.masked_lm_prob, args.seq_length,
 71 | #                                   args.short_seq_prob, args.seed)
 72 | #     truncated = 0
 73 | #     total = 0
 74 | #     for i, s in enumerate(ds):
 75 | #         ids = s['text']
 76 | #         tokens = ds.tokenizer.convert_ids_to_tokens(ids)
 77 | #         print(tokens)
 78 | #         if i >= args.count-1:
 79 | #             exit()
 80 | 
 81 | 
 82 | def main():
 83 |     parser = argparse.ArgumentParser()
 84 |     parser.add_argument('--data', type=str, help='prefix to data files')
 85 |     parser.add_argument('--dataset-impl', type=str, default='infer',
 86 |                         choices=['lazy', 'cached', 'mmap', 'infer'])
 87 |     parser.add_argument('--count', type=int, default=10,
 88 |                         help='Number of samples/documents to print')
 89 | 
 90 |     group = parser.add_argument_group(title='tokenizer')
 91 |     group.add_argument('--tokenizer-type', type=str, required=True,
 92 |                        choices=['BertWordPieceLowerCase',
 93 |                                 'GPT2BPETokenizer'],
 94 |                        help='What type of tokenizer to use.')
 95 |     group.add_argument('--vocab-file', type=str, default=None,
 96 |                        help='Path to the vocab file')
 97 |     group.add_argument('--merge-file', type=str, default=None,
 98 |                        help='Path to the BPE merge file (if necessary).')
 99 | 
100 |     parser.add_argument('--epochs', type=int, default=5,
101 |                         help='Number of epochs to plan for')
102 |     parser.add_argument('--max-num-samples', type=int, default=None,
103 |                         help='Maximum number of samples to plan for')
104 |     parser.add_argument('--masked-lm-prob', type=float, default=0.15,
105 |                         help='probability of masking tokens')
106 |     parser.add_argument('--seq-length', type=int, default=512,
107 |                         help='maximum sequence length')
108 |     parser.add_argument('--short-seq-prob', type=float, default=0.1,
109 |                         help='probability of creating a short sequence')
110 |     parser.add_argument('--seed', type=int, default=1234,
111 |                         help='random seed')
112 |     args = parser.parse_args()
113 |     args.rank = 0
114 |     args.make_vocab_size_divisible_by = 128
115 |     args.tensor_model_parallel_size = 1
116 | 
117 |     if args.dataset_impl == "infer":
118 |         args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)
119 | 
120 | #    test_albert_dataset(args)
121 |     test_indexed_dataset_get(args)
122 | 
123 | 
124 | if __name__ == "__main__":
125 |     main()
126 | 


--------------------------------------------------------------------------------
/src/megatron/data/test/test_preprocess_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | IMPL=cached
 4 | python ../preprocess_data.py \
 5 |        --input test_samples.json \
 6 |        --vocab vocab.txt \
 7 |        --dataset-impl ${IMPL} \
 8 |        --output-prefix test_samples_${IMPL} \
 9 |        --workers 1 \
10 |        --log-interval 2
11 | 


--------------------------------------------------------------------------------
/src/megatron/dist_signal_handler.py:
--------------------------------------------------------------------------------
 1 | import signal
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | def get_world_size():
 7 |     if torch.distributed.is_available() and torch.distributed.is_initialized():
 8 |         world_size = torch.distributed.get_world_size()
 9 |     else:
10 |         world_size = 1
11 |     return world_size
12 | 
13 | 
14 | def get_device(local_rank=None):
15 |     backend = torch.distributed.get_backend()
16 |     if backend == 'nccl':
17 |         if local_rank is None:
18 |             device = torch.device('cuda')
19 |         else:
20 |             device = torch.device(f'cuda:{local_rank}')
21 |     elif backend == 'gloo':
22 |         device = torch.device('cpu')
23 |     else:
24 |         raise RuntimeError
25 |     return device
26 | 
27 | 
28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
29 |     if not torch.distributed.is_available() or \
30 |        not torch.distributed.is_initialized():
31 |         return [item]
32 | 
33 |     device = get_device(local_rank)
34 | 
35 |     if group is not None:
36 |         group_size = group.size()
37 |     else:
38 |         group_size = get_world_size()
39 | 
40 |     tensor = torch.tensor([item], device=device, dtype=dtype)
41 |     output_tensors = [
42 |         torch.zeros(1, dtype=tensor.dtype, device=tensor.device)
43 |         for _ in range(group_size)
44 |     ]
45 |     torch.distributed.all_gather(output_tensors, tensor, group, async_op)
46 |     output = [elem.item() for elem in output_tensors]
47 |     return output
48 | 
49 | 
50 | class DistributedSignalHandler:
51 |     def __init__(self, sig=signal.SIGTERM):
52 |         self.sig = sig
53 | 
54 |     def signals_received(self):
55 |         all_received = all_gather_item(
56 |             self._signal_received, dtype=torch.int32
57 |         )
58 |         return all_received
59 | 
60 |     def __enter__(self):
61 |         self._signal_received = False
62 |         self.released = False
63 |         self.original_handler = signal.getsignal(self.sig)
64 | 
65 |         def handler(signum, frame):
66 |             self._signal_received = True
67 | 
68 |         signal.signal(self.sig, handler)
69 | 
70 |         return self
71 | 
72 |     def __exit__(self, type, value, tb):
73 |         self.release()
74 | 
75 |     def release(self):
76 |         if self.released:
77 |             return False
78 | 
79 |         signal.signal(self.sig, self.original_handler)
80 |         self.released = True
81 |         return True
82 | 


--------------------------------------------------------------------------------
/src/megatron/enums.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import enum
17 | 
18 | class LayerType(enum.Enum):
19 |     encoder = 1
20 |     decoder = 2
21 |  
22 | class AttnType(enum.Enum):
23 |     self_attn = 1
24 |     cross_attn = 2
25 | 
26 | class AttnMaskType(enum.Enum):
27 |     padding = 1
28 |     causal = 2
29 |     prefix = 3
30 | 
31 | class PositionEmbeddingType(enum.Enum):
32 |     rotary = 1
33 |     absolute = 2
34 |     alibi = 3
35 | 


--------------------------------------------------------------------------------
/src/megatron/fp16_deprecated/loss_scaler.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """For backward compatibility, we need the class definitions to deserialize."""
 4 | 
 5 | class LossScaler:
 6 |     def __init__(self, scale=1):
 7 |         self.cur_scale = scale
 8 | 
 9 | class DynamicLossScaler:
10 |     def __init__(self,
11 |                  init_scale=2**32,
12 |                  scale_factor=2.,
13 |                  scale_window=1000,
14 |                  min_scale=1,
15 |                  delayed_shift=1,
16 |                  consecutive_hysteresis=False):
17 |         self.cur_scale = init_scale
18 |         self.cur_iter = 0
19 |         self.last_overflow_iter = -1
20 |         self.scale_factor = scale_factor
21 |         self.scale_window = scale_window
22 |         self.min_scale = min_scale
23 |         self.delayed_shift = delayed_shift
24 |         self.cur_hysteresis = delayed_shift
25 |         self.consecutive_hysteresis = consecutive_hysteresis
26 | 
27 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/__init__.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import os
  4 | import pathlib
  5 | import subprocess
  6 | 
  7 | import torch
  8 | from torch.utils import cpp_extension
  9 | 
 10 | # Setting this param to a list has a problem of generating different
 11 | # compilation commands (with diferent order of architectures) and
 12 | # leading to recompilation of fused kernels. Set it to empty string
 13 | # to avoid recompilation and assign arch flags explicity in
 14 | # extra_cuda_cflags below
 15 | os.environ["TORCH_CUDA_ARCH_LIST"] = ""
 16 | 
 17 | 
 18 | def load(args):
 19 | 
 20 |     # Check if cuda 11 is installed for compute capability 8.0
 21 |     cc_flag = []
 22 |     if torch.version.hip is None:
 23 |         _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(
 24 |             cpp_extension.CUDA_HOME)
 25 |         if int(bare_metal_major) >= 11:
 26 |             cc_flag.append('-gencode')
 27 |             cc_flag.append('arch=compute_80,code=sm_80')
 28 |             if int(bare_metal_minor) >= 1:
 29 |                 cc_flag.append('-gencode')
 30 |                 cc_flag.append('arch=compute_86,code=sm_86')
 31 |             if int(bare_metal_minor) >= 4:
 32 |                 cc_flag.append('-gencode')
 33 |                 cc_flag.append('arch=compute_87,code=sm_87')
 34 |             if int(bare_metal_minor) >= 8:
 35 |                 cc_flag.append('-gencode')
 36 |                 cc_flag.append('arch=compute_89,code=sm_89')
 37 |         if int(bare_metal_major) >= 12:
 38 |             cc_flag.append('-gencode')
 39 |             cc_flag.append('arch=compute_90,code=sm_90')
 40 | 
 41 |     # Build path
 42 |     srcpath = pathlib.Path(__file__).parent.absolute()
 43 |     buildpath = srcpath / 'build'
 44 |     _create_build_dir(buildpath)
 45 | 
 46 |     # Helper function to build the kernels.
 47 |     def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths):
 48 |         if torch.version.hip is not None:
 49 |             extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag
 50 |         else:
 51 |             extra_cuda_cflags=['-O3',
 52 |                                '-gencode', 'arch=compute_70,code=sm_70',
 53 |                                '--use_fast_math'] + extra_cuda_flags + cc_flag
 54 | 
 55 |         return cpp_extension.load(
 56 |             name=name,
 57 |             sources=sources,
 58 |             build_directory=buildpath,
 59 |             extra_cflags=['-O3',],
 60 |             extra_cuda_cflags=extra_cuda_cflags,
 61 |             extra_include_paths=extra_include_paths,
 62 |             verbose=(args.rank == 0)
 63 |         )
 64 | 
 65 |     # ==============
 66 |     # Fused softmax.
 67 |     # ==============
 68 | 
 69 |     if torch.version.hip is not None:
 70 |         extra_include_paths=[os.path.abspath(srcpath)]
 71 |     else:
 72 |         extra_include_paths=[]
 73 | 
 74 |     if args.masked_softmax_fusion:
 75 |         if torch.version.hip is not None:
 76 |              extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1',
 77 |                                 '-D__HIP_NO_HALF_CONVERSIONS__=1']
 78 |         else:
 79 |              extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
 80 |                                 '-U__CUDA_NO_HALF_CONVERSIONS__',
 81 |                                 '--expt-relaxed-constexpr',
 82 |                                 '--expt-extended-lambda']
 83 |         
 84 |         # Upper triangular softmax.
 85 |         sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
 86 |                  srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
 87 |         scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
 88 |             "scaled_upper_triang_masked_softmax_cuda",
 89 |             sources, extra_cuda_flags, extra_include_paths)
 90 | 
 91 |         # Masked softmax.
 92 |         sources=[srcpath / 'scaled_masked_softmax.cpp',
 93 |                  srcpath / 'scaled_masked_softmax_cuda.cu']
 94 |         scaled_masked_softmax_cuda = _cpp_extention_load_helper(
 95 |             "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths)
 96 | 
 97 |         # Softmax
 98 |         sources=[srcpath / 'scaled_softmax.cpp',
 99 |                  srcpath / 'scaled_softmax_cuda.cu']
100 |         scaled_softmax_cuda = _cpp_extention_load_helper(
101 |             "scaled_softmax_cuda", sources, extra_cuda_flags, extra_include_paths)
102 | 
103 | 
104 | def _get_cuda_bare_metal_version(cuda_dir):
105 |     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
106 |                                          universal_newlines=True)
107 |     output = raw_output.split()
108 |     release_idx = output.index("release") + 1
109 |     release = output[release_idx].split(".")
110 |     bare_metal_major = release[0]
111 |     bare_metal_minor = release[1][0]
112 | 
113 |     return raw_output, bare_metal_major, bare_metal_minor
114 | 
115 | 
116 | def _create_build_dir(buildpath):
117 |     try:
118 |         os.mkdir(buildpath)
119 |     except OSError:
120 |         if not os.path.isdir(buildpath):
121 |             print(f"Creation of the build directory {buildpath} failed")
122 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved. */
 2 | 
 3 | /*This code is copied fron NVIDIA apex:
 4 |  *     https://github.com/NVIDIA/apex
 5 |  *     with minor changes. */
 6 | 
 7 | 
 8 | 
 9 | #ifndef TORCH_CHECK
10 | #define TORCH_CHECK AT_CHECK
11 | #endif
12 | 
13 | #ifdef VERSION_GE_1_3
14 | #define DATA_PTR data_ptr
15 | #else
16 | #define DATA_PTR data
17 | #endif
18 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_masked_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     torch::Tensor const& mask,
14 |     float scale_factor);
15 | 
16 | torch::Tensor bwd_cuda(
17 |     torch::Tensor const& output_grads, 
18 |     torch::Tensor const& softmax_results,
19 |     float scale_factor);
20 | 
21 | int get_batch_per_block_cuda(
22 |     int query_seq_len,
23 |     int key_seq_len,
24 |     int batches,
25 |     int attn_heads);
26 | 
27 | torch::Tensor fwd(
28 |     torch::Tensor const& input,
29 |     torch::Tensor const& mask,
30 |     float scale_factor) {
31 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
32 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
33 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
34 |       "Only fp16 and bf16 are supported");
35 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
36 | 
37 |   return fwd_cuda(input, mask, scale_factor);
38 | }
39 | 
40 | torch::Tensor bwd(
41 |     torch::Tensor const& output_grads, 
42 |     torch::Tensor const& softmax_results,
43 |     float scale_factor) {
44 | 
45 |   AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
46 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
47 | 
48 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
50 |       "Only fp16 and bf16 are supported");
51 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
53 |       "Only fp16 and bf16 are supported");
54 | 
55 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 | 
58 | int get_batch_per_block(
59 |     int query_seq_len,
60 |     int key_seq_len,
61 |     int batches,
62 |     int attn_heads) {
63 |     return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
64 | }
65 | 
66 | } // end namespace scaled_masked_softmax
67 | } // end namespace fused_softmax
68 | } // end namespace multihead_attn
69 | 
70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
71 |   m.def("forward", 
72 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 
73 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
74 | 
75 |   m.def("backward",
76 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
77 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
78 | 
79 |   m.def("get_batch_per_block",
80 |         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
81 |         "Return Batch per block size."
82 |   );
83 | }
84 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
  2 | 
  3 | #include <ATen/ATen.h>
  4 | #include <cuda.h>
  5 | #include <cuda_runtime.h>
  6 | #include <cuda_fp16.h>
  7 | #ifndef __HIP_PLATFORM_HCC__
  8 | #include <cuda_profiler_api.h>
  9 | #endif
 10 | #include <ATen/cuda/CUDAContext.h>
 11 | #include <torch/extension.h>
 12 | #include "scaled_masked_softmax.h"
 13 | #include "type_shim.h"
 14 | 
 15 | namespace multihead_attn {
 16 | namespace fused_softmax {
 17 | namespace scaled_masked_softmax {
 18 | 
 19 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){
 20 |     return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
 21 | }
 22 | 
 23 | 
 24 | torch::Tensor fwd_cuda(
 25 |     torch::Tensor const& input,
 26 |     torch::Tensor const& mask,
 27 |     float scale_factor)
 28 | {
 29 |   // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 30 |   const int batches = input.size(0);
 31 |   const int pad_batches = mask.size(0);
 32 |   const int attn_heads = input.size(1);
 33 |   const int query_seq_len = input.size(2);
 34 |   const int key_seq_len = input.size(3);
 35 |   TORCH_INTERNAL_ASSERT(key_seq_len <= 4096);
 36 |   TORCH_INTERNAL_ASSERT(query_seq_len > 1);
 37 |   TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
 38 |   TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
 39 |   TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
 40 |   TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);
 41 | 
 42 |   // Output 
 43 |   auto act_options = input.options().requires_grad(false);
 44 |   torch::Tensor softmax_results = 
 45 |       torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
 46 | 
 47 |   // Softmax Intermediate Result Ptr
 48 |   void* input_ptr = static_cast<void*>(input.data_ptr());
 49 |   void* mask_ptr = static_cast<void*>(mask.data_ptr());
 50 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
 51 | 
 52 |   DISPATCH_HALF_AND_BFLOAT(
 53 |       input.scalar_type(),
 54 |       "dispatch_scaled_masked_softmax_forward",
 55 |       dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>(
 56 |       reinterpret_cast<scalar_t*>(softmax_results_ptr),
 57 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
 58 | 	  reinterpret_cast<const uint8_t*>(mask_ptr),
 59 | 	  scale_factor,
 60 | 	  query_seq_len,
 61 | 	  key_seq_len,
 62 | 	  batches,
 63 | 	  attn_heads,
 64 | 	  pad_batches);
 65 |       );
 66 |   return softmax_results;
 67 | }
 68 | 
 69 | torch::Tensor bwd_cuda(
 70 |     torch::Tensor const& output_grads_, 
 71 |     torch::Tensor const& softmax_results_, 
 72 |     float scale_factor)  {
 73 | 	
 74 |   auto output_grads = output_grads_.contiguous();
 75 |   auto softmax_results = softmax_results_.contiguous();
 76 | 
 77 |   //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 78 |   const int batches = output_grads.size(0);
 79 |   const int attn_heads = output_grads.size(1);
 80 |   const int query_seq_len = output_grads.size(2);
 81 |   const int key_seq_len = output_grads.size(3);
 82 | 
 83 |   auto act_options = output_grads.options().requires_grad(false);
 84 |   torch::Tensor input_grads = 
 85 |             torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);  
 86 | 
 87 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
 88 |   void* input_grads_ptr = static_cast<void*>(input_grads.data_ptr());
 89 | 
 90 |   //Softmax Grad
 91 |   DISPATCH_HALF_AND_BFLOAT(
 92 |       output_grads_.scalar_type(),
 93 |       "dispatch_scaled_masked_softmax_backward",
 94 |       dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
 95 |       reinterpret_cast<scalar_t*>(input_grads_ptr), 
 96 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
 97 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
 98 | 	  scale_factor,
 99 | 	  query_seq_len,
100 | 	  key_seq_len,
101 | 	  batches,
102 | 	  attn_heads);
103 |       );
104 |   
105 |   return input_grads;
106 | }
107 | }
108 | }
109 | }
110 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     float scale_factor);
14 | 
15 | torch::Tensor bwd_cuda(
16 |     torch::Tensor const& output_grads, 
17 |     torch::Tensor const& softmax_results,
18 |     float scale_factor);
19 | 
20 | torch::Tensor fwd(
21 |     torch::Tensor const& input,
22 |     float scale_factor) {
23 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
24 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
25 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
26 |       "Only fp16 and bf16 are supported");
27 | 
28 |   return fwd_cuda(input, scale_factor);
29 | }
30 | 
31 | torch::Tensor bwd(
32 |     torch::Tensor const& output_grads, 
33 |     torch::Tensor const& softmax_results,
34 |     float scale_factor) {
35 | 
36 |   AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
37 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
38 | 
39 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
40 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
41 |       "Only fp16 and bf16 are supported");
42 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
43 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
44 |       "Only fp16 and bf16 are supported");
45 | 
46 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
47 | }
48 | 
49 | } // end namespace scaled_softmax
50 | } // end namespace fused_softmax
51 | } // end namespace multihead_attn
52 | 
53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
54 |   m.def("forward", 
55 |         &multihead_attn::fused_softmax::scaled_softmax::fwd, 
56 | 	"Self Multihead Attention scaled, softmax -- Forward.");
57 |   m.def("backward", 
58 |         &multihead_attn::fused_softmax::scaled_softmax::bwd,
59 | 	"Self Multihead Attention scaled, softmax -- Backward.");
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <ATen/ATen.h>
 4 | #include <cuda.h>
 5 | #include <cuda_runtime.h>
 6 | #include <cuda_fp16.h>
 7 | #ifndef __HIP_PLATFORM_HCC__
 8 | #include <cuda_profiler_api.h>
 9 | #endif
10 | #include <ATen/cuda/CUDAContext.h>
11 | #include <torch/extension.h>
12 | #include "scaled_masked_softmax.h"
13 | #include "type_shim.h"
14 | 
15 | namespace multihead_attn {
16 | namespace fused_softmax {
17 | namespace scaled_softmax {
18 | 
19 | torch::Tensor fwd_cuda(
20 |     torch::Tensor const& input,
21 |     float scale_factor)
22 | {
23 |   // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
24 |   const int batches = input.size(0);
25 |   const int attn_heads = input.size(1);
26 |   const int query_seq_len = input.size(2);
27 |   const int key_seq_len = input.size(3);
28 |   TORCH_INTERNAL_ASSERT(key_seq_len <= 4096);
29 |   TORCH_INTERNAL_ASSERT(query_seq_len > 1);
30 | 
31 |   // Output 
32 |   auto act_options = input.options().requires_grad(false);
33 |   torch::Tensor softmax_results = 
34 |       torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
35 | 
36 |   // Softmax Intermediate Result Ptr
37 |   void* input_ptr = static_cast<void*>(input.data_ptr());
38 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
39 | 
40 |   DISPATCH_HALF_AND_BFLOAT(
41 |       input.scalar_type(),
42 |       "dispatch_scaled_softmax_forward",
43 |       dispatch_scaled_softmax_forward<scalar_t, scalar_t, float>(
44 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
45 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
46 | 	  scale_factor,
47 | 	  query_seq_len,
48 | 	  key_seq_len,
49 | 	  batches,
50 | 	  attn_heads);
51 |       );
52 |   return softmax_results;
53 | }
54 | 
55 | torch::Tensor bwd_cuda(
56 |     torch::Tensor const& output_grads_, 
57 |     torch::Tensor const& softmax_results_, 
58 |     float scale_factor)  {
59 | 	
60 |   auto output_grads = output_grads_.contiguous();
61 |   auto softmax_results = softmax_results_.contiguous();
62 | 
63 |   //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
64 |   const int batches = output_grads.size(0);
65 |   const int attn_heads = output_grads.size(1);
66 |   const int query_seq_len = output_grads.size(2);
67 |   const int key_seq_len = output_grads.size(3);
68 | 
69 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
70 | 
71 |   //Softmax Grad
72 |   DISPATCH_HALF_AND_BFLOAT(
73 |       output_grads_.scalar_type(),
74 |       "dispatch_scaled_masked_softmax_backward",
75 |       dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
76 |           reinterpret_cast<scalar_t*>(output_grads_ptr), 
77 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
78 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
79 | 	  scale_factor,
80 | 	  query_seq_len,
81 | 	  key_seq_len,
82 | 	  batches,
83 | 	  attn_heads);
84 | 			   );
85 |   
86 |   //backward pass is completely in-place
87 |   return output_grads;
88 | }
89 | }
90 | }
91 | }
92 | 
93 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_upper_triang_masked_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     float scale_factor);
14 | 
15 | torch::Tensor bwd_cuda(
16 |     torch::Tensor const& output_grads, 
17 |     torch::Tensor const& softmax_results,
18 |     float scale_factor);
19 | 
20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
21 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
22 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
23 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
24 |       "Only fp16 and bf16 are supported");
25 | 
26 |   return fwd_cuda(input, scale_factor);
27 | }
28 | 
29 | torch::Tensor bwd(
30 |     torch::Tensor const& output_grads, 
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor) {
33 | 
34 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
35 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
36 | 
37 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
38 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
39 |       "Only fp16 and bf16 are supported");
40 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
41 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
42 |       "Only fp16 and bf16 are supported");
43 | 
44 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
45 | }
46 | 
47 | } // end namespace scaled_upper_triang_masked_softmax
48 | } // end namespace fused_softmax
49 | } // end namespace multihead_attn
50 | 
51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
52 |   m.def("forward", 
53 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
54 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
55 |   m.def("backward", 
56 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
57 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
58 | }
59 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <ATen/ATen.h>
 4 | #include <cuda.h>
 5 | #include <cuda_runtime.h>
 6 | #include <cuda_fp16.h>
 7 | #ifndef __HIP_PLATFORM_HCC__
 8 | #include <cuda_profiler_api.h>
 9 | #endif
10 | #include <ATen/cuda/CUDAContext.h>
11 | #include <torch/extension.h>
12 | #include "scaled_upper_triang_masked_softmax.h"
13 | #include "type_shim.h"
14 | 
15 | namespace multihead_attn {
16 | namespace fused_softmax {
17 | namespace scaled_upper_triang_masked_softmax {
18 | 
19 | torch::Tensor fwd_cuda(
20 |     torch::Tensor const& input, 
21 |     float scale_factor)
22 | {
23 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
24 |   const int attn_batches = input.size(0);
25 |   const int seq_len = input.size(1);
26 |   TORCH_INTERNAL_ASSERT(seq_len <= 16384);
27 | 
28 |   // Output 
29 |   auto act_options = input.options().requires_grad(false);
30 |   torch::Tensor softmax_results = 
31 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
32 | 
33 |   // Softmax Intermediate Result Ptr
34 |   void* input_ptr = static_cast<void*>(input.data_ptr());
35 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
36 | 
37 |   DISPATCH_HALF_AND_BFLOAT(
38 |       input.scalar_type(),
39 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
40 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
41 | 	  reinterpret_cast<scalar_t*>(softmax_results_ptr),
42 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
43 | 	  scale_factor,
44 | 	  seq_len,
45 | 	  seq_len,
46 | 	  attn_batches);
47 |       );
48 |   return softmax_results;
49 | }
50 | 				      
51 | 
52 | torch::Tensor bwd_cuda(
53 |     torch::Tensor const& output_grads_, 
54 |     torch::Tensor const& softmax_results_, 
55 |     float scale_factor)  {
56 | 	
57 |   auto output_grads = output_grads_.contiguous();
58 |   auto softmax_results = softmax_results_.contiguous();
59 | 
60 |   //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
61 |   const int attn_batches = output_grads.size(0);
62 |   const int seq_len = output_grads.size(1);
63 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
64 | 
65 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
66 | 
67 |   //Softmax Grad
68 |   DISPATCH_HALF_AND_BFLOAT(
69 |       output_grads_.scalar_type(),
70 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
71 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
72 |           reinterpret_cast<scalar_t*>(output_grads_ptr), 
73 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
74 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
75 | 	  scale_factor,
76 | 	  seq_len,
77 | 	  seq_len,
78 | 	  attn_batches);
79 |       );
80 |   
81 |   //backward pass is completely in-place
82 |   return output_grads;
83 | }
84 | }
85 | }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/fused_kernels/tests/__init__.py


--------------------------------------------------------------------------------
/src/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
  2 | 
  3 | 
  4 | #include <ATen/ATen.h>
  5 | #include "compat.h"
  6 | 
  7 | 
  8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)			\
  9 |   switch(TYPE)								\
 10 |     {									\
 11 |     case at::ScalarType::Half:						\
 12 |       {									\
 13 | 	using scalar_t = at::Half;					\
 14 | 	__VA_ARGS__;							\
 15 | 	break;								\
 16 |       }									\
 17 |     case at::ScalarType::BFloat16:					\
 18 |       {									\
 19 | 	using scalar_t = at::BFloat16;					\
 20 | 	__VA_ARGS__;							\
 21 | 	break;								\
 22 |       }									\
 23 |     default:								\
 24 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
 25 |       }
 26 | 
 27 | 
 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...)			\
 29 |   switch(TYPE)								\
 30 |     {									\
 31 |     case at::ScalarType::Half:						\
 32 |       {									\
 33 | 	using scalar_t = at::Half;					\
 34 | 	__VA_ARGS__;							\
 35 | 	break;								\
 36 |       }									\
 37 |     case at::ScalarType::BFloat16:					\
 38 |       {									\
 39 | 	using scalar_t = at::BFloat16;					\
 40 | 	__VA_ARGS__;							\
 41 | 	break;								\
 42 |       }									\
 43 |     case at::ScalarType::Float:						\
 44 |       {									\
 45 | 	using scalar_t = float;					\
 46 | 	__VA_ARGS__;							\
 47 | 	break;								\
 48 |       }									\
 49 |     default:								\
 50 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
 51 |       }
 52 | 
 53 | 
 54 | 
 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
 56 |   switch(TYPEIN)							\
 57 |     {									\
 58 |     case at::ScalarType::Float:						\
 59 |       {									\
 60 | 	using scalar_t_in = float;					\
 61 | 	switch(TYPEOUT)							\
 62 | 	  {								\
 63 | 	  case at::ScalarType::Float:					\
 64 | 	    {								\
 65 | 	      using scalar_t_out = float;				\
 66 | 	      __VA_ARGS__;						\
 67 | 	      break;							\
 68 | 	    }								\
 69 | 	  case at::ScalarType::Half:					\
 70 | 	    {								\
 71 | 	      using scalar_t_out = at::Half;				\
 72 | 	      __VA_ARGS__;						\
 73 | 	      break;							\
 74 | 	    }								\
 75 | 	  case at::ScalarType::BFloat16:				\
 76 | 	    {								\
 77 | 	      using scalar_t_out = at::BFloat16;			\
 78 | 	      __VA_ARGS__;						\
 79 | 	      break;							\
 80 | 	    }								\
 81 | 	  default:							\
 82 | 	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
 83 | 	  }								\
 84 | 	break;								\
 85 |       }									\
 86 |     case at::ScalarType::Half:						\
 87 |       {									\
 88 | 	using scalar_t_in = at::Half;					\
 89 | 	using scalar_t_out = at::Half;					\
 90 | 	__VA_ARGS__;							\
 91 | 	break;								\
 92 |       }									\
 93 |     case at::ScalarType::BFloat16:					\
 94 |       {									\
 95 | 	using scalar_t_in = at::BFloat16;				\
 96 | 	using scalar_t_out = at::BFloat16;				\
 97 | 	__VA_ARGS__;							\
 98 | 	break;								\
 99 |       }									\
100 |     default:								\
101 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
102 |     }
103 | 
104 | 


--------------------------------------------------------------------------------
/src/megatron/global_vars.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Megatron global variables."""
  4 | 
  5 | import os
  6 | import sys
  7 | import torch
  8 | 
  9 | from megatron import dist_signal_handler
 10 | from megatron.tokenizer import build_tokenizer
 11 | from .microbatches import build_num_microbatches_calculator
 12 | from .timers import Timers
 13 | 
 14 | _GLOBAL_ARGS = None
 15 | _GLOBAL_RETRO_ARGS = None
 16 | _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
 17 | _GLOBAL_TOKENIZER = None
 18 | _GLOBAL_TENSORBOARD_WRITER = None
 19 | _GLOBAL_ADLR_AUTORESUME = None
 20 | _GLOBAL_TIMERS = None
 21 | _GLOBAL_SIGNAL_HANDLER = None
 22 | 
 23 | def get_args():
 24 |     """Return arguments."""
 25 |     _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
 26 |     return _GLOBAL_ARGS
 27 | 
 28 | 
 29 | def get_retro_args():
 30 |     """Return retro arguments."""
 31 |     return _GLOBAL_RETRO_ARGS
 32 | 
 33 | 
 34 | def get_num_microbatches():
 35 |     return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get()
 36 | 
 37 | 
 38 | def get_current_global_batch_size():
 39 |     return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size()
 40 | 
 41 | 
 42 | def update_num_microbatches(consumed_samples, consistency_check=True):
 43 |     _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples,
 44 |                                                consistency_check)
 45 | 
 46 | 
 47 | def get_tokenizer():
 48 |     """Return tokenizer."""
 49 |     _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
 50 |     return _GLOBAL_TOKENIZER
 51 | 
 52 | 
 53 | def get_tensorboard_writer():
 54 |     """Return tensorboard writer. It can be None so no need
 55 |     to check if it is initialized."""
 56 |     return _GLOBAL_TENSORBOARD_WRITER
 57 | 
 58 | 
 59 | def get_adlr_autoresume():
 60 |     """ADLR autoresume object. It can be None so no need
 61 |     to check if it is initialized."""
 62 |     return _GLOBAL_ADLR_AUTORESUME
 63 | 
 64 | 
 65 | def get_timers():
 66 |     """Return timers."""
 67 |     _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
 68 |     return _GLOBAL_TIMERS
 69 | 
 70 | 
 71 | def get_signal_handler():
 72 |     _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
 73 |     return _GLOBAL_SIGNAL_HANDLER
 74 | 
 75 | 
 76 | def _set_signal_handler():
 77 |     global _GLOBAL_SIGNAL_HANDLER
 78 |     _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
 79 |     _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__()
 80 | 
 81 | 
 82 | 
 83 | def set_global_variables(args):
 84 |     """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
 85 | 
 86 |     assert args is not None
 87 | 
 88 |     _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
 89 |     set_args(args)
 90 | 
 91 |     _build_num_microbatches_calculator(args)
 92 |     _ = _build_tokenizer(args)
 93 |     _set_tensorboard_writer(args)
 94 |     _set_adlr_autoresume(args)
 95 |     _set_timers(args)
 96 | 
 97 |     if args.exit_signal_handler:
 98 |         _set_signal_handler()
 99 |     
100 | 
101 | def set_args(args):
102 |     global _GLOBAL_ARGS
103 |     _GLOBAL_ARGS = args
104 | 
105 | 
106 | def set_retro_args(retro_args):
107 |     global _GLOBAL_RETRO_ARGS
108 |     _GLOBAL_RETRO_ARGS = retro_args
109 | 
110 | 
111 | def _build_num_microbatches_calculator(args):
112 | 
113 |     global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
114 |     _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR,
115 |                                    'num microbatches calculator')
116 | 
117 |     _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator(
118 |         args)
119 | 
120 | 
121 | def _build_tokenizer(args):
122 |     """Initialize tokenizer."""
123 |     global _GLOBAL_TOKENIZER
124 |     _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
125 |     _GLOBAL_TOKENIZER = build_tokenizer(args)
126 |     return _GLOBAL_TOKENIZER
127 | 
128 | 
129 | def rebuild_tokenizer(args):
130 |     global _GLOBAL_TOKENIZER
131 |     _GLOBAL_TOKENIZER = None
132 |     return _build_tokenizer(args)
133 | 
134 | 
135 | def _set_tensorboard_writer(args):
136 |     """Set tensorboard writer."""
137 |     global _GLOBAL_TENSORBOARD_WRITER
138 |     _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
139 |                                    'tensorboard writer')
140 | 
141 |     if hasattr(args, 'tensorboard_dir') and \
142 |        args.tensorboard_dir and args.rank == (args.world_size - 1):
143 |         try:
144 |             from torch.utils.tensorboard import SummaryWriter
145 |             print('> setting tensorboard ...')
146 |             _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
147 |                 log_dir=args.tensorboard_dir,
148 |                 max_queue=args.tensorboard_queue_size)
149 |         except ModuleNotFoundError:
150 |             print('WARNING: TensorBoard writing requested but is not '
151 |                   'available (are you using PyTorch 1.1.0 or later?), '
152 |                   'no TensorBoard logs will be written.', flush=True)
153 | 
154 | 
155 | def _set_adlr_autoresume(args):
156 |     """Initialize ADLR autoresume."""
157 |     global _GLOBAL_ADLR_AUTORESUME
158 |     _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
159 | 
160 |     if args.adlr_autoresume:
161 |         if args.rank == 0:
162 |             print('enabling autoresume ...', flush=True)
163 |         sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
164 |         try:
165 |             from userlib.auto_resume import AutoResume
166 |         except BaseException:
167 |             print('ADLR autoresume is not available, exiting ...')
168 |             sys.exit()
169 | 
170 |         _GLOBAL_ADLR_AUTORESUME = AutoResume
171 | 
172 | 
173 | def _set_timers(args):
174 |     """Initialize timers."""
175 |     global _GLOBAL_TIMERS
176 |     _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
177 |     _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option)
178 | 
179 | 
180 | def _ensure_var_is_initialized(var, name):
181 |     """Make sure the input variable is not None."""
182 |     assert var is not None, '{} is not initialized.'.format(name)
183 | 
184 | 
185 | def _ensure_var_is_not_initialized(var, name):
186 |     """Make sure the input variable is not None."""
187 |     assert var is None, '{} is already initialized.'.format(name)
188 | 


--------------------------------------------------------------------------------
/src/megatron/indexer.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import time
  3 | import torch
  4 | import torch.distributed as dist
  5 | 
  6 | from megatron import get_args, print_rank_0
  7 | from megatron.core import mpu
  8 | from megatron.checkpointing import load_biencoder_checkpoint
  9 | from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
 10 | from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch
 11 | from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader
 12 | from megatron.data.realm_index import detach, OpenRetreivalDataStore
 13 | from megatron.model.biencoder_model import get_model_provider
 14 | from megatron.training import get_model
 15 | 
 16 | 
 17 | class IndexBuilder(object):
 18 |     """
 19 |     Object for taking one pass over a dataset and creating a BlockData of its
 20 |     embeddings
 21 |     """
 22 |     def __init__(self):
 23 |         args = get_args()
 24 |         self.model = None
 25 |         self.dataloader = None
 26 |         self.evidence_embedder_obj = None
 27 |         self.biencoder_shared_query_context_model = \
 28 |             args.biencoder_shared_query_context_model
 29 | 
 30 |         # need to know whether we're using a REALM checkpoint (args.load)
 31 |         # or ICT checkpoint
 32 |         assert not (args.load and args.ict_load)
 33 | 
 34 |         self.log_interval = args.indexer_log_interval
 35 |         self.batch_size = args.indexer_batch_size
 36 | 
 37 |         self.load_attributes()
 38 |         self.is_main_builder = mpu.get_data_parallel_rank() == 0
 39 |         self.num_total_builders = mpu.get_data_parallel_world_size()
 40 |         self.iteration = self.total_processed = 0
 41 | 
 42 |     def load_attributes(self):
 43 |         """
 44 |         Load the necessary attributes: model, dataloader and empty BlockData
 45 |         """
 46 |         only_context_model = True
 47 |         if self.biencoder_shared_query_context_model:
 48 |             only_context_model = False
 49 | 
 50 |         model = get_model(get_model_provider(only_context_model=\
 51 |             only_context_model, biencoder_shared_query_context_model=\
 52 |             self.biencoder_shared_query_context_model))
 53 | 
 54 |         self.model = load_biencoder_checkpoint(model,
 55 |                 only_context_model=only_context_model)
 56 | 
 57 |         assert len(self.model) == 1
 58 |         self.model[0].eval()
 59 | 
 60 |         self.dataset = get_open_retrieval_wiki_dataset()
 61 |         self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \
 62 |             self.batch_size))
 63 | 
 64 |         self.evidence_embedder_obj = OpenRetreivalDataStore( \
 65 |             load_from_path=False)
 66 | 
 67 |     def track_and_report_progress(self, batch_size):
 68 |         """
 69 |         Utility function for tracking progress
 70 |         """
 71 |         self.iteration += 1
 72 |         self.total_processed += batch_size * self.num_total_builders
 73 |         if self.is_main_builder and self.iteration % self.log_interval == 0:
 74 |             print('Batch {:10d} | Total {:10d}'.format(self.iteration,
 75 |                 self.total_processed), flush=True)
 76 | 
 77 |     def build_and_save_index(self):
 78 |         """
 79 |         Goes through one epoch of the dataloader and adds all data to this
 80 |         instance's BlockData.
 81 | 
 82 |         The copy of BlockData is saved as a shard, which when run in a
 83 |         distributed setting will be consolidated by the rank 0 process
 84 |         and saved as a final pickled BlockData.
 85 |         """
 86 |         assert len(self.model) == 1
 87 |         unwrapped_model = self.model[0]
 88 | 
 89 |         while not hasattr(unwrapped_model, 'embed_text'):
 90 |             unwrapped_model = unwrapped_model.module
 91 | 
 92 |         while True:
 93 |             try:
 94 |                 # batch also has query_tokens and query_pad_data
 95 |                 row_id, context_tokens, context_mask, context_types, \
 96 |                     context_pad_mask = get_open_retrieval_batch( \
 97 |                     self.dataloader)
 98 |             except (StopIteration, IndexError):
 99 |                 break
100 | 
101 |             # TODO: can we add with torch.no_grad() to reduce memory usage
102 |             # detach, separate fields and add to BlockData
103 |             assert context_mask.dtype == torch.bool
104 |             context_logits = unwrapped_model.embed_text(
105 |                 unwrapped_model.context_model, context_tokens, context_mask,
106 |                 context_types)
107 | 
108 |             context_logits = detach(context_logits)
109 |             row_id = detach(row_id)
110 | 
111 |             self.evidence_embedder_obj.add_block_data(row_id, context_logits)
112 |             self.track_and_report_progress(batch_size=len(row_id))
113 | 
114 |         # This process signals to finalize its shard and then synchronize with
115 |         # the other processes
116 |         self.evidence_embedder_obj.save_shard()
117 |         torch.distributed.barrier()
118 |         del self.model
119 | 
120 |         # rank 0 process builds the final copy
121 |         if self.is_main_builder:
122 |             self.evidence_embedder_obj.merge_shards_and_save()
123 |             # make sure that every single piece of data was embedded
124 |             assert len(self.evidence_embedder_obj.embed_data) == \
125 |                 len(self.dataset)
126 |         self.evidence_embedder_obj.clear()
127 | 
128 |         # complete building the final copy
129 |         torch.distributed.barrier()
130 | 


--------------------------------------------------------------------------------
/src/megatron/memory.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | 
  4 | import torch
  5 | from deepspeed.accelerator import get_accelerator
  6 | 
  7 | # A dictionary of all the memory buffers allocated.
  8 | _MEM_BUFFS = dict()
  9 | 
 10 | 
 11 | def allocate_mem_buff(name, numel, dtype, track_usage):
 12 |     """Allocate a memory buffer."""
 13 |     assert name not in _MEM_BUFFS, \
 14 |         'memory buffer {} already allocated.'.format(name)
 15 |     _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage)
 16 |     return _MEM_BUFFS[name]
 17 | 
 18 | 
 19 | def get_mem_buff(name):
 20 |     """Get the memory buffer."""
 21 |     return _MEM_BUFFS[name]
 22 | 
 23 | 
 24 | class MemoryBuffer:
 25 |     """Contiguous memory buffer.
 26 |     Allocate a contiguous memory of type `dtype` and size `numel`. It is
 27 |     used to reduce memory fragmentation.
 28 | 
 29 |     Usage: After the allocation, the `_start` index is set tot the first
 30 |            index of the memory. A memory chunk starting from `_start` index
 31 |            can be `allocated` for an input tensor, with the elements of the
 32 |            tensor being coppied. The buffer can be reused by resetting the
 33 |            `_start` index.
 34 | 
 35 |     """
 36 |     def __init__(self, name, numel, dtype, track_usage):
 37 |         if torch.distributed.get_rank() == 0:
 38 |             element_size = torch.tensor([], dtype=dtype).element_size()
 39 |             print('> building the {} memory buffer with {} num elements '
 40 |                   'and {} dtype ({:.1f} MB)...'.format(
 41 |                       name, numel, dtype, numel*element_size/1024/1024),
 42 |                   flush=True)
 43 |         self.name = name
 44 |         self.numel = numel
 45 |         self.dtype = dtype
 46 |         self.data = torch.empty(self.numel,
 47 |                                 dtype=self.dtype,
 48 |                                 device=get_accelerator().current_device_name(),
 49 |                                 requires_grad=False)
 50 | 
 51 |         # Index tracking the start of the free memory.
 52 |         self._start = 0
 53 | 
 54 |         # Values used for tracking usage.
 55 |         self.track_usage = track_usage
 56 |         if self.track_usage:
 57 |             self.in_use_value = 0.0
 58 |             self.total_value = 0.0
 59 | 
 60 | 
 61 |     def reset(self):
 62 |         """Reset the buffer start index to the beginning of the buffer."""
 63 |         self._start = 0
 64 | 
 65 | 
 66 |     def is_in_use(self):
 67 |         """Whether the current buffer hold on to any memory."""
 68 |         return self._start > 0
 69 | 
 70 | 
 71 |     def numel_in_use(self):
 72 |         """Return number of elements in use."""
 73 |         return self._start
 74 | 
 75 | 
 76 |     def add(self, tensor):
 77 |         """Allocate a chunk of memory from the buffer to tensor and copy
 78 |         the values."""
 79 |         assert tensor.dtype == self.dtype, \
 80 |             'Input tensor type {} different from buffer type {}'.format(
 81 |                 tensor.dtype, self.dtype)
 82 |         # Number of elements of the input tensor.
 83 |         tensor_numel = torch.numel(tensor)
 84 |         new_start = self._start + tensor_numel
 85 |         assert new_start <= self.numel, \
 86 |             'Not enough memory left in the buffer ({} > {})'.format(
 87 |                 tensor_numel, self.numel - self._start)
 88 |         # New tensor is a view into the memory.
 89 |         new_tensor = self.data[self._start:new_start]
 90 |         self._start = new_start
 91 |         new_tensor = new_tensor.view(tensor.shape)
 92 |         new_tensor.copy_(tensor)
 93 |         # Return a pointer to the new tensor.
 94 |         return new_tensor
 95 | 
 96 | 
 97 |     def get_data(self):
 98 |         """Return the data currently in use."""
 99 |         if self.track_usage:
100 |             self.in_use_value += float(self._start)
101 |             self.total_value += float(self.numel)
102 |         return self.data[:self._start]
103 | 
104 | 
105 |     def print_average_usage(self):
106 |         """Print memory usage average over time. We would like this value
107 |         to be as high as possible."""
108 |         assert self.track_usage, 'You need to enable track usage.'
109 |         if torch.distributed.get_rank() == 0:
110 |             print(' > usage of {} memory buffer: {:.2f} %'.format(
111 |                 self.name, self.in_use_value * 100.0 / self.total_value),
112 |                   flush=True)
113 | 
114 | 
115 | 
116 | class RingMemBuffer:
117 |     """A ring of memory buffers."""
118 | 
119 |     def __init__(self, name, num_buffers, numel, dtype, track_usage):
120 |         self.num_buffers = num_buffers
121 |         self.buffers = [
122 |             allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage)
123 |             for i in range(num_buffers)]
124 |         self._index = -1
125 | 
126 | 
127 |     def get_next_buffer(self):
128 |         self._index += 1
129 |         self._index = self._index % self.num_buffers
130 |         buff = self.buffers[self._index]
131 |         assert not buff.is_in_use(), 'buffer is already in use.'
132 |         return buff
133 | 


--------------------------------------------------------------------------------
/src/megatron/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from deepspeed.accelerator.real_accelerator import get_accelerator
 4 | if get_accelerator().device_name() == 'cuda':
 5 |     from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
 6 | else:
 7 |     from torch.nn import LayerNorm
 8 | 
 9 | from .distributed import DistributedDataParallel
10 | from .bert_model import BertModel
11 | from .gpt_model import GPTModel, GPTModelPipe
12 | from .t5_model import T5Model
13 | from .language_model import get_language_model
14 | from .module import Float16Module
15 | 


--------------------------------------------------------------------------------
/src/megatron/model/classification.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Classification model."""
  4 | 
  5 | import torch
  6 | 
  7 | from megatron import get_args, print_rank_last
  8 | from megatron.model.enums import AttnMaskType
  9 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
 10 | from megatron.model.language_model import get_language_model
 11 | from megatron.model.utils import get_linear_layer
 12 | from megatron.model.utils import init_method_normal
 13 | from megatron.model.utils import scaled_init_method_normal
 14 | from .module import MegatronModule
 15 | 
 16 | 
 17 | class Classification(MegatronModule):
 18 | 
 19 |     def __init__(self,
 20 |                  config,
 21 |                  num_classes,
 22 |                  num_tokentypes=2,
 23 |                  pre_process=True,
 24 |                  post_process=True):
 25 |         super().__init__(config=config, share_embeddings_and_output_weights=False)
 26 |         args = get_args()
 27 | 
 28 |         self.num_classes = num_classes
 29 |         self.pre_process = pre_process
 30 |         self.post_process = post_process
 31 | 
 32 |         self.language_model, self._language_model_key = get_language_model(
 33 |             config=config,
 34 |             num_tokentypes=num_tokentypes,
 35 |             add_pooler=True,
 36 |             encoder_attn_mask_type=AttnMaskType.padding,
 37 |             pre_process=self.pre_process,
 38 |             post_process=self.post_process)
 39 | 
 40 |         # Multi-choice head.
 41 |         if self.post_process:
 42 |             self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
 43 |             self.classification_head = get_linear_layer(args.hidden_size,
 44 |                                                         self.num_classes,
 45 |                                                         init_method,
 46 |                                                         gather_params_on_init=args.zero_stage == 3)
 47 |             self._classification_head_key = 'classification_head'
 48 | 
 49 |     def set_input_tensor(self, input_tensor):
 50 |         """See megatron.model.transformer.set_input_tensor()"""
 51 |         self.language_model.set_input_tensor(input_tensor)
 52 | 
 53 |     def forward(self, model_input, attention_mask, tokentype_ids=None):
 54 | 
 55 |         extended_attention_mask = bert_extended_attention_mask(attention_mask)
 56 |         input_ids = model_input
 57 |         position_ids = bert_position_ids(input_ids)
 58 | 
 59 |         lm_output = self.language_model(
 60 |             input_ids,
 61 |             position_ids,
 62 |             extended_attention_mask,
 63 |             tokentype_ids=tokentype_ids
 64 |         )
 65 | 
 66 |         if self.post_process:
 67 |             _, pooled_output = lm_output[0], lm_output[1]
 68 |             classification_output = self.classification_dropout(pooled_output)
 69 |             classification_logits = self.classification_head(classification_output)
 70 | 
 71 |             # Reshape back to separate choices.
 72 |             classification_logits = classification_logits.view(-1, self.num_classes)
 73 | 
 74 |             return classification_logits
 75 |         return lm_output
 76 | 
 77 |     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
 78 |         """For easy load when model is combined with other heads,
 79 |         add an extra key."""
 80 | 
 81 |         state_dict_ = {}
 82 |         state_dict_[self._language_model_key] \
 83 |             = self.language_model.state_dict_for_save_checkpoint(prefix=prefix,
 84 |                                                                  keep_vars=keep_vars)
 85 |         if self.post_process:
 86 |             state_dict_[self._classification_head_key] \
 87 |                 = self.classification_head.state_dict(prefix=prefix, keep_vars=keep_vars)
 88 |         return state_dict_
 89 | 
 90 |     def load_state_dict(self, state_dict, strict=True):
 91 |         """Customized load."""
 92 | 
 93 |         self.language_model.load_state_dict(
 94 |             state_dict[self._language_model_key], strict=strict)
 95 |         if self.post_process:
 96 |             if self._classification_head_key in state_dict:
 97 |                 self.classification_head.load_state_dict(
 98 |                     state_dict[self._classification_head_key], strict=strict)
 99 |             else:
100 |                 print_rank_last('***WARNING*** could not find {} in the checkpoint, '
101 |                                 'initializing to random'.format(
102 |                                     self._classification_head_key))
103 | 


--------------------------------------------------------------------------------
/src/megatron/model/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | class LayerType(enum.Enum):
 6 |     encoder = 1
 7 |     decoder = 2
 8 |     retro_encoder = 3
 9 |     retro_decoder = 4
10 |     retro_decoder_with_retriever = 5
11 |  
12 | class AttnType(enum.Enum):
13 |     self_attn = 1
14 |     cross_attn = 2
15 | 
16 | class AttnMaskType(enum.Enum):
17 |     padding = 1
18 |     causal = 2
19 | 
20 | # For backward compatibility with old model checkpoints
21 | from megatron.core.enums import ModelType
22 | 


--------------------------------------------------------------------------------
/src/megatron/model/fused_bias_gelu.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 7 | # 1/sqrt(2*pi)-> 0.3989423
 8 | # 1/sqrt(2)   -> 0.70710678
 9 | # sqrt(2/pi)  -> 0.79788456
10 | # this function is tanh approximation of gelu
11 | # actual gelu is:
12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
13 | 
14 | @torch.jit.script
15 | def bias_gelu(bias, y):
16 |     x = bias + y
17 |     return  x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
18 | 
19 | # gradient of tanh approximation of gelu
20 | # gradient of actual gelu is:
21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
22 | @torch.jit.script
23 | def bias_gelu_back(g, bias, y):
24 |     x = bias + y
25 |     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
26 |     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
27 |     ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
28 |     return ff*g
29 | 
30 | class GeLUFunction(torch.autograd.Function):
31 |     @staticmethod
32 |     # bias is an optional argument
33 |     def forward(ctx, input, bias):
34 |         ctx.save_for_backward(input, bias)
35 |         return bias_gelu(bias, input)
36 | 
37 |     @staticmethod
38 |     def backward(ctx, grad_output):
39 |         input, bias = ctx.saved_tensors
40 |         tmp = bias_gelu_back(grad_output, bias, input)
41 |         return tmp, tmp
42 | 
43 | bias_gelu_impl = GeLUFunction.apply
44 | 


--------------------------------------------------------------------------------
/src/megatron/model/fused_layer_norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """This code is copied fron NVIDIA apex:
 4 |       https://github.com/NVIDIA/apex
 5 |    with some changes. """
 6 | 
 7 | import numbers
 8 | import torch
 9 | from torch.nn.parameter import Parameter
10 | from torch.nn import init
11 | import importlib
12 | from torch.nn import functional as F
13 | 
14 | from megatron.core.utils import make_viewless_tensor
15 | 
16 | try:
17 |     from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
18 |     HAVE_PERSIST_LAYER_NORM = True
19 | except:
20 |     HAVE_PERSIST_LAYER_NORM = False
21 | 
22 | from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
23 | 
24 | 
25 | global fused_layer_norm_cuda
26 | fused_layer_norm_cuda = None
27 | 
28 | 
29 | class MixedFusedLayerNorm(torch.nn.Module):
30 | 
31 |   def __init__(self, normalized_shape, eps=1e-5,
32 |                no_persist_layer_norm=True,
33 |                sequence_parallel=False,
34 |                apply_layernorm_1p=False):
35 |         super(MixedFusedLayerNorm, self).__init__()
36 | 
37 |         self.apply_layernorm_1p = apply_layernorm_1p
38 | 
39 |         global fused_layer_norm_cuda
40 |         fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
41 | 
42 |         # List of hiddens sizes supported in the persistent layer norm kernel
43 |         # If the hidden size is not supported, fall back to the non-persistent
44 |         # kernel.
45 |         persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
46 |             5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
47 |             24576, 25600, 30720, 32768, 40960, 49152, 65536]
48 |         if normalized_shape not in persist_ln_hidden_sizes or \
49 |                 not HAVE_PERSIST_LAYER_NORM:
50 |             no_persist_layer_norm = True
51 | 
52 |         if isinstance(normalized_shape, numbers.Integral):
53 |             normalized_shape = (normalized_shape,)
54 |         self.normalized_shape = torch.Size(normalized_shape)
55 |         self.eps = eps
56 |         self.weight = Parameter(torch.Tensor(*normalized_shape))
57 |         self.bias = Parameter(torch.Tensor(*normalized_shape))
58 |         self.reset_parameters()
59 |         self.no_persist_layer_norm = no_persist_layer_norm
60 |         self.sequence_parallel = sequence_parallel
61 | 
62 |         # set sequence parallelism flag on weight and bias parameters
63 |         setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
64 |         setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
65 | 
66 | 
67 |   def reset_parameters(self):
68 | 
69 |     if self.apply_layernorm_1p:
70 |         init.zeros_(self.weight)
71 |         init.zeros_(self.bias)
72 |     else:
73 |         init.ones_(self.weight)
74 |         init.zeros_(self.bias)
75 | 
76 |   def forward(self, input):
77 | 
78 |     weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
79 |     # CPU path is here for unittest sake.
80 |     if not input.is_cuda:
81 |         print("WARNING! The input of FusedLayerNorm should be on the GPU."
82 |               "This warning should only be triggered in the FusedLayerNorm unit tests.")
83 |         return F.layer_norm(input, self.normalized_shape, weight, self.bias, self.eps)
84 | 
85 |     if self.no_persist_layer_norm:
86 |         return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps)
87 |     else:
88 |         output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
89 | 
90 |         # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
91 |         # a populated '_base' field). This will result in schedule.py's
92 |         # deallocate_output_tensor() throwing an error, so a viewless tensor is
93 |         # created to prevent this.
94 |         output = make_viewless_tensor(inp = output,
95 |                                       requires_grad = input.requires_grad,
96 |                                       keep_graph = True)
97 | 
98 |         return output
99 | 


--------------------------------------------------------------------------------
/src/megatron/model/multiple_choice.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Multiple choice model."""
  4 | 
  5 | import torch
  6 | 
  7 | from megatron import get_args, print_rank_last
  8 | from megatron.model.enums import AttnMaskType
  9 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
 10 | from megatron.model.language_model import get_language_model
 11 | from megatron.model.utils import get_linear_layer
 12 | from megatron.model.utils import init_method_normal
 13 | from megatron.model.utils import scaled_init_method_normal
 14 | from .module import MegatronModule
 15 | 
 16 | 
 17 | class MultipleChoice(MegatronModule):
 18 | 
 19 |     def __init__(self,
 20 |                  config,
 21 |                  num_tokentypes=2,
 22 |                  pre_process=True,
 23 |                  post_process=True):
 24 |         super(MultipleChoice, self).__init__(share_embeddings_and_output_weights=False)
 25 |         args = get_args()
 26 | 
 27 |         self.pre_process = pre_process
 28 |         self.post_process = post_process
 29 | 
 30 |         self.language_model, self._language_model_key = get_language_model(
 31 |             config=config,
 32 |             num_tokentypes=num_tokentypes,
 33 |             add_pooler=True,
 34 |             encoder_attn_mask_type=AttnMaskType.padding,
 35 |             pre_process=self.pre_process,
 36 |             post_process=self.post_process)
 37 | 
 38 |         # Multi-choice head.
 39 |         if self.post_process:
 40 |             self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
 41 |             self.multichoice_head = get_linear_layer(args.hidden_size, 1,
 42 |                                                      init_method,
 43 |                                                      gather_params_on_init=args.zero_stage == 3)
 44 |             self._multichoice_head_key = 'multichoice_head'
 45 | 
 46 |     def set_input_tensor(self, input_tensor):
 47 |         """See megatron.model.transformer.set_input_tensor()"""
 48 |         self.language_model.set_input_tensor(input_tensor)
 49 | 
 50 |     def forward(self, model_input, attention_mask, tokentype_ids=None):
 51 | 
 52 |         # [batch, choices, sequence] --> [batch * choices, sequence] -->
 53 |         #    transformer --> [batch, choices] --> softmax
 54 | 
 55 |         # Ensure the shape is [batch-size, choices, sequence]
 56 |         assert len(attention_mask.shape) == 3
 57 |         num_choices = attention_mask.shape[1]
 58 | 
 59 |         # Reshape and treat choice dimension the same as batch.
 60 |         attention_mask = attention_mask.view(-1, attention_mask.size(-1))
 61 |         extended_attention_mask = bert_extended_attention_mask(attention_mask)
 62 | 
 63 |         input_ids = model_input
 64 |         # Do the same as attention_mask for input_ids, tokentype_ids
 65 |         assert len(input_ids.shape) == 3
 66 |         assert len(tokentype_ids.shape) == 3
 67 |         input_ids = input_ids.view(-1, input_ids.size(-1))
 68 |         tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
 69 |         position_ids = bert_position_ids(input_ids)
 70 | 
 71 |         lm_output = self.language_model(
 72 |             input_ids,
 73 |             position_ids,
 74 |             extended_attention_mask,
 75 |             tokentype_ids=tokentype_ids
 76 |         )
 77 |         if self.post_process:
 78 |             _, pooled_output = lm_output[0], lm_output[1]
 79 |             multichoice_output = self.multichoice_dropout(pooled_output)
 80 |             multichoice_logits = self.multichoice_head(multichoice_output)
 81 | 
 82 |             # Reshape back to separate choices.
 83 |             multichoice_logits = multichoice_logits.view(-1, num_choices)
 84 | 
 85 |             return multichoice_logits
 86 |         return lm_output
 87 | 
 88 |     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
 89 |         """For easy load when model is combined with other heads,
 90 |         add an extra key."""
 91 | 
 92 |         state_dict_ = {}
 93 |         state_dict_[self._language_model_key] \
 94 |             = self.language_model.state_dict_for_save_checkpoint(prefix=prefix,
 95 |                                                                  keep_vars=keep_vars)
 96 |         if self.post_process:
 97 |             state_dict_[self._multichoice_head_key] \
 98 |                 = self.multichoice_head.state_dict(prefix=prefix, keep_vars=keep_vars)
 99 |         return state_dict_
100 | 
101 |     def load_state_dict(self, state_dict, strict=True):
102 |         """Customized load."""
103 | 
104 |         self.language_model.load_state_dict(
105 |             state_dict[self._language_model_key], strict=strict)
106 |         if self.post_process:
107 |             if self._multichoice_head_key in state_dict:
108 |                 self.multichoice_head.load_state_dict(
109 |                     state_dict[self._multichoice_head_key], strict=strict)
110 |             else:
111 |                 print_rank_last('***WARNING*** could not find {} in the checkpoint, '
112 |                                 'initializing to random'.format(
113 |                                     self._multichoice_head_key))
114 | 


--------------------------------------------------------------------------------
/src/megatron/model/rotary_pos_embedding.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | # The following code has been taken from https://github.com/NVIDIA/NeMo/blob/ \
 4 | # 782b4e1652aaa43c8be390d9db0dc89544afa080/nemo/collections/nlp/modules/ \
 5 | # common/megatron/rotary_pos_embedding.py
 6 | 
 7 | import importlib.util
 8 | import torch
 9 | 
10 | from torch import einsum, nn
11 | 
12 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
13 | 
14 | class RotaryEmbedding(nn.Module):
15 |     def __init__(self, dim):
16 |         super().__init__()
17 |         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
18 |         self.register_buffer('inv_freq', inv_freq)
19 |         if importlib.util.find_spec('einops') is None:
20 |             raise RuntimeError("einops is required for Rotary Embedding")
21 | 
22 |     def forward(self, max_seq_len, offset=0):
23 |         seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
24 |         freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
25 |         # first part even vector components, second part odd vector components,
26 |         #  2 * dim in dimension size
27 |         emb = torch.cat((freqs, freqs), dim=-1)
28 |         # emb [seq_length, .., dim]
29 |         from einops import rearrange
30 |         return rearrange(emb, 'n d -> n 1 1 d')
31 | 
32 | 
33 | def _rotate_half(x):
34 |     """
35 |     change sign so the last dimension becomes [-odd, +even]
36 |     """
37 |     from einops import rearrange
38 |     x = rearrange(x, '... (j d) -> ... j d', j=2)
39 |     x1, x2 = x.unbind(dim=-2)
40 |     return torch.cat((-x2, x1), dim=-1)
41 | 
42 | 
43 | def apply_rotary_pos_emb(t, freqs):
44 |     """
45 |     input tensor t is of shape [seq_length, ..., dim]
46 |     rotary positional embeding tensor freqs is of shape [seq_length, ..., dim]
47 |     check https://kexue.fm/archives/8265 for detailed formulas
48 |     """
49 |     rot_dim = freqs.shape[-1]
50 |     # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
51 |     t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
52 | 
53 |     # first part is cosine component
54 |     # second part is sine component, need to change signs with _rotate_half method
55 |     t = (t * freqs.cos().to(t.dtype)) + (_rotate_half(t) * freqs.sin().to(t.dtype))
56 |     return torch.cat((t, t_pass), dim=-1)
57 | 


--------------------------------------------------------------------------------
/src/megatron/model/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Utilities for models."""
 4 | 
 5 | import math
 6 | 
 7 | import torch
 8 | 
 9 | from megatron import get_args
10 | 
11 | from deepspeed.runtime.zero import GatheredParameters
12 | 
13 | def init_method_normal(sigma):
14 |     """Init method based on N(0, sigma)."""
15 |     def init_(tensor):
16 |         return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
17 | 
18 |     return init_
19 | 
20 | 
21 | def scaled_init_method_normal(sigma, num_layers):
22 |     """Init method based on N(0, sigma/sqrt(2*num_layers)."""
23 |     std = sigma / math.sqrt(2.0 * num_layers)
24 | 
25 |     def init_(tensor):
26 |         return torch.nn.init.normal_(tensor, mean=0.0, std=std)
27 | 
28 |     return init_
29 | 
30 | 
31 | def gather_and_init(param, init_method):
32 |     with GatheredParameters(param, modifier_rank=0):
33 |         init_method(param)
34 |         
35 | 
36 | def attention_mask_func(attention_scores, attention_mask):
37 |     args = get_args()
38 |     if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
39 |         attention_mask_ = attention_mask
40 |         actual_seqlen = attention_scores.size()[2]
41 |         if actual_seqlen != attention_mask_.size()[2]:
42 |             # attention_mask has size [1, 1, seqlen, seqlen]
43 |             attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous()
44 |         attention_scores.masked_fill_(attention_mask_, -10000.0)
45 |     else:
46 |         attention_scores.masked_fill_(attention_mask, -10000.0)
47 |     return attention_scores
48 | 
49 | 
50 | def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
51 |     """Simple linear layer with weight initialization."""
52 |     layer = torch.nn.Linear(rows, columns)
53 |     if get_args().perform_initialization:
54 |         with GatheredParameters(layer.weight, modifier_rank=0, enabled=gather_params_on_init):
55 |             init_method(layer.weight)
56 |     with torch.no_grad():
57 |         with GatheredParameters(layer.bias, modifier_rank=0, enabled=gather_params_on_init):
58 |             layer.bias.zero_()
59 |     return layer
60 | 
61 | @torch.jit.script
62 | def gelu_impl(x):
63 |     """OpenAI's gelu implementation."""
64 |     return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
65 |                                        (1.0 + 0.044715 * x * x)))
66 | def openai_gelu(x):
67 |     return gelu_impl(x)
68 | 
69 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
70 | @torch.jit.script
71 | def erf_gelu(x):
72 |     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
73 | 


--------------------------------------------------------------------------------
/src/megatron/model/vision/classification.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Vision Transformer(VIT) model."""
 4 | 
 5 | import torch
 6 | from torch.nn.init import trunc_normal_
 7 | from megatron import get_args
 8 | from megatron.model.utils import get_linear_layer
 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead
10 | from megatron.model.vision.mit_backbone import mit_b3_avg
11 | from megatron.model.module import MegatronModule
12 | 
13 | class VitClassificationModel(MegatronModule):
14 |     """Vision Transformer Model."""
15 | 
16 |     def __init__(self, config, num_classes, finetune=False,
17 |                  pre_process=True, post_process=True):
18 |         super(VitClassificationModel, self).__init__()
19 |         args = get_args()
20 | 
21 |         self.hidden_size = args.hidden_size
22 |         self.num_classes = num_classes
23 |         self.finetune = finetune
24 |         self.pre_process = pre_process
25 |         self.post_process = post_process
26 |         self.backbone = VitBackbone(
27 |             config=config,
28 |             pre_process=self.pre_process,
29 |             post_process=self.post_process,
30 |             single_token_output=True
31 |         )
32 |         
33 |         if self.post_process:
34 |             if not self.finetune:
35 |                 self.head = VitMlpHead(self.hidden_size, self.num_classes)
36 |             else:
37 |                 self.head = get_linear_layer(
38 |                     self.hidden_size,
39 |                     self.num_classes,
40 |                     torch.nn.init.zeros_,
41 |                     gather_params_on_init=args.zero_stage == 3
42 |                 )
43 | 
44 |     def set_input_tensor(self, input_tensor):
45 |         """See megatron.model.transformer.set_input_tensor()"""
46 |         self.backbone.set_input_tensor(input_tensor)
47 | 
48 |     def forward(self, input):
49 |         hidden_states = self.backbone(input)
50 | 
51 |         if self.post_process:
52 |             hidden_states = self.head(hidden_states)
53 | 
54 |         return hidden_states
55 | 
56 | 
57 | class MitClassificationModel(MegatronModule):
58 |     """Mix vision Transformer Model."""
59 | 
60 |     def __init__(self, num_classes,
61 |                  pre_process=True, post_process=True):
62 |         super(MitClassificationModel, self).__init__()
63 |         args = get_args()
64 | 
65 |         self.hidden_size = args.hidden_size
66 |         self.num_classes = num_classes
67 | 
68 |         self.backbone = mit_b3_avg()
69 |         self.head = torch.nn.Linear(512, num_classes)
70 |         self.apply(self._init_weights)
71 | 
72 |     def _init_weights(self, m):
73 |         if isinstance(m, torch.nn.Linear):
74 |             trunc_normal_(m.weight, std=.02)
75 |             if isinstance(m, torch.nn.Linear) and m.bias is not None:
76 |                 torch.nn.init.constant_(m.bias, 0)
77 | 
78 |     def set_input_tensor(self, input_tensor):
79 |         """See megatron.model.transformer.set_input_tensor()"""
80 |         pass
81 | 
82 |     def forward(self, input):
83 |         hidden_states = self.backbone(input)
84 |         hidden_states = self.head(hidden_states)
85 | 
86 |         return hidden_states
87 | 


--------------------------------------------------------------------------------
/src/megatron/model/vision/inpainting.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # This source code is licensed under the BSD license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | i
  6 | import math
  7 | import apex
  8 | import einops
  9 | import torch
 10 | import torch.nn.functional as F
 11 | from megatron import get_args, print_rank_0
 12 | from megatron.model.utils import get_linear_layer
 13 | from megatron.model.vision.vit_backbone import VitBackbone
 14 | from megatron.model.module import MegatronModule
 15 | from megatron.model.vision.mit_backbone import mit_b3
 16 | from megatron.model.vision.utils import resize_
 17 | 
 18 | 
 19 | class VitInpaintingModel(MegatronModule):
 20 | 
 21 |     def __init__(self, config, pre_process=True, post_process=True):
 22 |         super(VitInpaintingModel, self).__init__()
 23 |         args = get_args()
 24 | 
 25 |         self.pre_process = pre_process
 26 |         self.post_process = post_process
 27 |         self.hidden_size = config.hidden_size
 28 |         self.backbone = VitBackbone(
 29 |             config=config,
 30 |             pre_process=self.pre_process,
 31 |             post_process=self.post_process,
 32 |             class_token=False,
 33 |         )
 34 |         self.patch_dim = args.patch_dim
 35 |         self.img_h = args.img_h
 36 |         self.img_w = args.img_w
 37 |         self.seq_length = args.seq_length
 38 |         # full mask
 39 | 
 40 |         if self.post_process:
 41 |             self.linear_decoder = get_linear_layer(
 42 |                 self.hidden_size,
 43 |                 self.backbone.flatten_dim,
 44 |                 torch.nn.init.zeros_,
 45 |                 gather_params_on_init=args.zero_stage == 3
 46 |             )
 47 | 
 48 |     def set_input_tensor(self, input_tensor):
 49 |         self.backbone.set_input_tensor(input_tensor)
 50 | 
 51 |     def forward(self, input):
 52 | 
 53 |         hidden_states = self.backbone(input)
 54 | 
 55 |         if not self.post_process:
 56 |             return hidden_states
 57 |         decoded_output = self.linear_decoder(hidden_states)
 58 |         output = einops.rearrange(
 59 |                 decoded_output,
 60 |                 "b (h w) (p1 p2 c) -> b c (h p1) (w p2)",
 61 |                 p1=self.patch_dim,
 62 |                 p2=self.patch_dim,
 63 |                 h=self.img_h//self.patch_dim,
 64 |                 w=self.img_w//self.patch_dim,
 65 |             )
 66 | 
 67 |         return output
 68 | 
 69 | 
 70 | class MLP(torch.nn.Module):
 71 |     """
 72 |     Linear Embedding
 73 |     """
 74 |     def __init__(self, input_dim=2048, embed_dim=768):
 75 |         super().__init__()
 76 |         self.proj = torch.nn.Linear(input_dim, embed_dim)
 77 | 
 78 |     def forward(self, x):
 79 |         x = x.flatten(2).transpose(1, 2)
 80 |         x = self.proj(x)
 81 |         return x
 82 | 
 83 | 
 84 | class MitInpaintingModel(MegatronModule):
 85 |     """Mix vision Transformer Model."""
 86 | 
 87 |     def __init__(self, pre_process=True, post_process=True):
 88 |         super(MitInpaintingModel, self).__init__()
 89 |         self.pre_process = pre_process
 90 |         self.post_process = post_process
 91 | 
 92 |         args = get_args()
 93 |         self.patch_dim = args.patch_dim
 94 |         self.img_h = args.img_h
 95 |         self.img_w = args.img_w
 96 |         self.flatten_dim = self.patch_dim * self.patch_dim * 3
 97 |         self.backbone = mit_b3()
 98 | 
 99 |         self.in_channels = [64, 128, 320, 512]
100 |         self.embedding_dim = 768
101 | 
102 |         c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels
103 | 
104 |         self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=self.embedding_dim)
105 |         self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=self.embedding_dim)
106 |         self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=self.embedding_dim)
107 |         self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=self.embedding_dim)
108 | 
109 |         self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False)
110 |         self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim)
111 |         self.dropout = torch.nn.Dropout2d(0.1)
112 |         
113 |         self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1)
114 |     
115 |     def set_input_tensor(self, input_tensor):
116 |         """See megatron.model.transformer.set_input_tensor()"""
117 |         pass
118 | 
119 |     def forward(self, input):
120 |         c1, c2, c3, c4 = self.backbone(input)
121 | 
122 |         n, _, h, w = c4.shape
123 |         _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
124 |         _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)
125 |     
126 |         _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
127 |         _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False)
128 | 
129 |         _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3])
130 |         _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False)
131 | 
132 |         _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3])
133 | 
134 |         _c = torch.cat([_c4, _c3, _c2, _c1], dim=1)
135 |         _c = self.conv_fuse(_c)
136 |  
137 |         x = self.norm(_c)
138 |         x = F.relu(x, inplace=True)
139 |         x = self.dropout(x)
140 | 
141 |         x = self.linear_pred(x)
142 | 
143 |         output = einops.rearrange(
144 |             x,
145 |             "b (c p1 p2) h w -> b c (h p1) (w p2)",
146 |             p1=self.patch_dim,
147 |             p2=self.patch_dim,
148 |             h=self.img_h//self.patch_dim,
149 |             w=self.img_w//self.patch_dim,
150 |         )
151 | 
152 |         return output
153 | 


--------------------------------------------------------------------------------
/src/megatron/model/vision/knn_monitor.py:
--------------------------------------------------------------------------------
  1 | import torch.nn.functional as F
  2 | import torch
  3 | from megatron import print_rank_0, get_args
  4 | from megatron.core import mpu
  5 | from megatron.data.vit_dataset import ClassificationTransform
  6 | from megatron.data.image_folder import ImageFolder
  7 | 
  8 | _FEATURE_BANK = None
  9 | 
 10 | 
 11 | def build_data_loader(dataset, drop_last=True, shuffle=False):
 12 |     """Data loader. Note that batch-size is the local (per GPU) batch-size."""
 13 |     # Sampler.
 14 |     args = get_args()
 15 |     micro_batch_size = 16
 16 |     num_workers = args.num_workers
 17 |     world_size = mpu.get_data_parallel_world_size()
 18 |     rank = mpu.get_data_parallel_rank()
 19 |     sampler = torch.utils.data.distributed.DistributedSampler(
 20 |         dataset, num_replicas=world_size, rank=rank,
 21 |         drop_last=drop_last, shuffle=shuffle
 22 |     )
 23 | 
 24 |     # Data loader. Note that batch size is the per GPU batch size.
 25 |     data_loader = torch.utils.data.DataLoader(
 26 |         dataset,
 27 |         batch_size=micro_batch_size,
 28 |         sampler=sampler,
 29 |         shuffle=False,
 30 |         num_workers=num_workers,
 31 |         drop_last=not drop_last,
 32 |         pin_memory=True,
 33 |     )
 34 |     return data_loader
 35 | 
 36 | 
 37 | def compute_feature_bank(model):
 38 |     args = get_args()
 39 |     global _FEATURE_BANK
 40 |     feature_bank = []
 41 |     feature_label = []
 42 | 
 43 |     train_ds = ImageFolder(
 44 |         root=args.data_path[0],
 45 |         transform=ClassificationTransform((args.img_h, args.img_w), train=False),
 46 |         data_per_class_fraction=1.0
 47 |     )
 48 |     classes = len(train_ds.classes)
 49 |     dataloader = build_data_loader(train_ds)
 50 |      
 51 |     for m in model:
 52 |         m.eval()
 53 | 
 54 |     with torch.no_grad():
 55 |         for i, batch in enumerate(dataloader):
 56 |             images = batch[0].cuda().contiguous()
 57 |             labels = batch[1].cuda().contiguous()
 58 |             student_feature, teacher_feature = model[0](images)
 59 |             feature = F.normalize(teacher_feature.float(), dim=1)
 60 |             feature_bank.append(feature)
 61 |             feature_label.append(labels)
 62 |     
 63 |     for m in model:
 64 |         m.train()
 65 | 
 66 |     # [N', D]
 67 |     feature_bank = torch.cat(feature_bank, dim=0).contiguous()
 68 |     feature_label = torch.cat(feature_label, dim=0).contiguous()
 69 | 
 70 |     feature_banks = [torch.zeros_like(feature_bank)
 71 |                      for i in range(mpu.get_data_parallel_world_size())]
 72 |     torch.distributed.all_gather(feature_banks,
 73 |                                  feature_bank,
 74 |                                  group=mpu.get_data_parallel_group())
 75 | 
 76 |     assert torch.all(torch.eq(feature_banks[mpu.get_data_parallel_rank()],
 77 |                               feature_bank))
 78 | 
 79 |     feature_labels = [torch.zeros_like(feature_label)
 80 |                       for i in range(mpu.get_data_parallel_world_size())]
 81 |     torch.distributed.all_gather(feature_labels,
 82 |                                  feature_label,
 83 |                                  group=mpu.get_data_parallel_group())
 84 | 
 85 |     # [D, N]
 86 |     feature_banks = torch.cat(feature_banks, dim=0).t().contiguous()
 87 |     # [N]
 88 |     feature_labels = torch.cat(feature_labels, dim=0).contiguous()
 89 |     print_rank_0("feature_banks size is {}".format(feature_banks.size()))
 90 |     print_rank_0("feature labels size is {}".format(feature_labels.size()))
 91 | 
 92 |     _FEATURE_BANK = (feature_banks, feature_labels, classes)
 93 | 
 94 | 
 95 | def get_feature_bank():
 96 |     global _FEATURE_BANK
 97 |     assert _FEATURE_BANK is not None
 98 |     return _FEATURE_BANK
 99 | 
100 | 
101 | # knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
102 | # implementation follows http://github.com/zhirongw/lemniscate.pytorch and
103 | # https://github.com/leftthomas/SimCLR
104 | def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t):
105 |     # compute cos similarity between each feature vector and feature bank ---> [B, N]
106 |     sim_matrix = torch.mm(feature, feature_bank)
107 |     # [B, K]
108 |     sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1)
109 |     # [B, K]
110 |     sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1),
111 |                               dim=-1,
112 |                               index=sim_indices)
113 |     sim_weight = (sim_weight / knn_t).exp()
114 | 
115 |     # counts for each class
116 |     one_hot_label = torch.zeros(feature.size(0) * knn_k,
117 |                                 classes,
118 |                                 device=sim_labels.device)
119 |     # [B*K, C]
120 |     one_hot_label = one_hot_label.scatter(dim=-1,
121 |                                           index=sim_labels.view(-1, 1),
122 |                                           value=1.0)
123 |     # weighted score ---> [B, C]
124 |     pred_scores = torch.sum(
125 |             one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1),
126 |             dim=1)
127 | 
128 |     pred_labels = pred_scores.argsort(dim=-1, descending=True)
129 |     return pred_labels
130 | 


--------------------------------------------------------------------------------
/src/megatron/model/vision/utils.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | import torch
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | def resize(input,
 7 |            size=None,
 8 |            scale_factor=None,
 9 |            mode='nearest',
10 |            align_corners=None,
11 |            warning=True):
12 |     if warning:
13 |         if size is not None and align_corners:
14 |             input_h, input_w = tuple(int(x) for x in input.shape[2:])
15 |             output_h, output_w = tuple(int(x) for x in size)
16 |             if output_h > input_h or output_w > output_h:
17 |                 if ((output_h > 1 and output_w > 1 and input_h > 1
18 |                      and input_w > 1) and (output_h - 1) % (input_h - 1)
19 |                         and (output_w - 1) % (input_w - 1)):
20 |                     warnings.warn(
21 |                         f'When align_corners={align_corners}, '
22 |                         'the output would more aligned if '
23 |                         f'input size {(input_h, input_w)} is `x+1` and '
24 |                         f'out size {(output_h, output_w)} is `nx+1`')
25 |     if isinstance(size, torch.Size):
26 |         size = tuple(int(x) for x in size)
27 |     return F.interpolate(input, size, scale_factor, mode, align_corners)
28 | 


--------------------------------------------------------------------------------
/src/megatron/mpu/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/FastLLM/3113a18be60f959925e87f36f364504ec99725a0/src/megatron/mpu/tests/__init__.py


--------------------------------------------------------------------------------
/src/megatron/mpu/tests/commons.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import argparse
 4 | import os
 5 | import random
 6 | import numpy
 7 | import torch
 8 | 
 9 | import mpu
10 | from deepspeed.accelerator import get_accelerator
11 | 
12 | class IdentityLayer(torch.nn.Module):
13 |     def __init__(self, size, scale=1.0):
14 |         super(IdentityLayer, self).__init__()
15 |         self.weight = torch.nn.Parameter(scale * torch.randn(size))
16 | 
17 |     def forward(self):
18 |         return self.weight
19 | 
20 | 
21 | def set_random_seed(seed):
22 |     """Set random seed for reproducability."""
23 |     random.seed(seed)
24 |     numpy.random.seed(seed)
25 |     torch.manual_seed(seed)
26 |     mpu.model_parallel_cuda_manual_seed(seed)
27 | 
28 | 
29 | def initialize_distributed(backend='nccl'):
30 |     """Initialize torch.distributed."""
31 |     # Get local rank in case it is provided.
32 |     parser = argparse.ArgumentParser()
33 |     parser.add_argument('--local_rank', type=int, default=None,
34 |                         help='local rank passed from distributed launcher')
35 |     args = parser.parse_args()
36 |     local_rank = args.local_rank
37 | 
38 |     # Get rank and world size.
39 |     rank = int(os.getenv('RANK', '0'))
40 |     world_size = int(os.getenv("WORLD_SIZE", '1'))
41 | 
42 |     print('> initializing torch.distributed with local rank: {}, '
43 |           'rank: {}, world size: {}'.format(local_rank, rank, world_size))
44 | 
45 |     # Set the device id.
46 |     device = rank % get_accelerator().device_count()
47 |     if local_rank is not None:
48 |         device = local_rank
49 |     get_accelerator().set_device(device)
50 | 
51 |     # Call the init process.
52 |     init_method = 'tcp://'
53 |     master_ip = os.getenv('MASTER_ADDR', 'localhost')
54 |     master_port = os.getenv('MASTER_PORT', '6000')
55 |     init_method += master_ip + ':' + master_port
56 |     torch.distributed.init_process_group(
57 |         backend=backend,
58 |         world_size=world_size,
59 |         rank=rank,
60 |         init_method=init_method)
61 | 
62 | 
63 | def print_separator(message):
64 |     torch.distributed.barrier()
65 |     filler_len = (78 - len(message)) // 2
66 |     filler = '-' * filler_len
67 |     string = '\n' + filler + ' {} '.format(message) + filler
68 |     if torch.distributed.get_rank() == 0:
69 |         print(string, flush=True)
70 |     torch.distributed.barrier()
71 | 


--------------------------------------------------------------------------------
/src/megatron/mpu/tests/test_cross_entropy.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import set_random_seed
 4 | from commons import IdentityLayer
 5 | from commons import print_separator
 6 | from commons import initialize_distributed
 7 | from mpu.cross_entropy import vocab_parallel_cross_entropy
 8 | import mpu
 9 | import torch.nn.functional as F
10 | import torch
11 | import random
12 | import sys
13 | from deepspeed.accelerator import get_accelerator
14 | sys.path.append("../..")
15 | 
16 | 
17 | def torch_cross_entropy(batch_size, seq_length, vocab_size,
18 |                         logits_scale, seed):
19 |     set_random_seed(seed)
20 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
21 |                              scale=logits_scale).to(get_accelerator().device_name())
22 |     logits = identity()
23 |     target = get_accelerator().LongTensor(
24 |         size=(batch_size, seq_length)).random_(0, vocab_size)
25 |     loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
26 |                            target.view(-1),
27 |                            reduction='none').view_as(target).mean()
28 |     loss.backward()
29 |     return loss, identity.weight.grad
30 | 
31 | 
32 | def mpu_cross_entropy(batch_size, seq_length, vocab_size,
33 |                       logits_scale, seed):
34 |     set_random_seed(seed)
35 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
36 |                              scale=logits_scale).to(get_accelerator().device_name())
37 |     logits = identity()
38 |     logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
39 |     target = get_accelerator().LongTensor(
40 |         size=(batch_size, seq_length)).random_(0, vocab_size)
41 |     loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
42 |     loss.backward()
43 |     return loss, identity.weight.grad
44 | 
45 | 
46 | def test_cross_entropy(tensor_model_parallel_size):
47 | 
48 |     if torch.distributed.get_rank() == 0:
49 |         print('> testing cross entropy with model parallel size {} ...'.
50 |               format(tensor_model_parallel_size))
51 | 
52 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
53 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
54 | 
55 |     batch_size = 13
56 |     seq_length = 17
57 |     vocab_size_per_partition = 11
58 |     logits_scale = 1000.0
59 |     vocab_size = vocab_size_per_partition * tensor_model_parallel_size
60 |     seed = 1234
61 | 
62 |     loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
63 |                                                  vocab_size, logits_scale,
64 |                                                  seed)
65 |     loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
66 |                                            vocab_size, logits_scale,
67 |                                            seed)
68 | 
69 |     error = loss_torch.sub_(loss_mpu).abs().max()
70 |     print('   max error in loss on global rank {}: {}'.format(
71 |         torch.distributed.get_rank(), error))
72 |     assert error < 1.0e-6
73 | 
74 |     error = grad_torch.sub_(grad_mpu).abs().max()
75 |     print('   max error in grad on global rank {}: {}'.format(
76 |         torch.distributed.get_rank(), error))
77 |     assert error < 1.0e-6
78 | 
79 |     # Reset groups
80 |     mpu.destroy_tensor_model_parallel()
81 | 
82 |     torch.distributed.barrier()
83 |     if torch.distributed.get_rank() == 0:
84 |         print('>> passed the test :-)')
85 | 
86 | 
87 | if __name__ == '__main__':
88 | 
89 |     initialize_distributed()
90 |     world_size = torch.distributed.get_world_size()
91 | 
92 |     tensor_model_parallel_size = 1
93 |     while tensor_model_parallel_size <= world_size:
94 |         print_separator('test cross entropy')
95 |         test_cross_entropy(tensor_model_parallel_size)
96 |         tensor_model_parallel_size *= 2
97 | 


--------------------------------------------------------------------------------
/src/megatron/mpu/tests/test_data.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import print_separator
 4 | from commons import initialize_distributed
 5 | from deepspeed.accelerator import get_accelerator
 6 | from mpu import data as data_utils
 7 | import mpu
 8 | import torch
 9 | import functools
10 | import operator
11 | import sys
12 | sys.path.append("../..")
13 | 
14 | 
15 | def test_broadcast_data(tensor_model_parallel_size):
16 | 
17 |     if torch.distributed.get_rank() == 0:
18 |         print('> testing broadcast_data with model parallel size {} ...'.
19 |               format(tensor_model_parallel_size))
20 | 
21 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
22 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
23 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
24 | 
25 |     key_size_t = {'key1': [7, 11],
26 |                   'key2': [8, 2, 1],
27 |                   'key3': [13],
28 |                   'key4': [5, 1, 2],
29 |                   'key5': [5, 12]}
30 |     keys = list(key_size_t.keys())
31 | 
32 |     data = {}
33 |     data_t = {}
34 |     for key in key_size_t:
35 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
36 |         data_t[key] = data[key].clone()
37 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
38 |     data_t['keyX'] = data['keyX'].clone()
39 |     if mpu.get_tensor_model_parallel_rank() != 0:
40 |         data = None
41 | 
42 |     data_utils._check_data_types(keys, data_t, torch.int64)
43 |     key_size, key_numel, \
44 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
45 |     for key in keys:
46 |         assert key_size[key] == key_size_t[key]
47 |     total_numel_t = 0
48 |     for key in keys:
49 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
50 |         assert key_numel[key] == target_size
51 |         total_numel_t += target_size
52 |     assert total_numel == total_numel_t
53 | 
54 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
55 |     for key in keys:
56 |         tensor = data_t[key].to(get_accelerator().device_name())
57 |         assert data_b[key].sub(tensor).abs().max() == 0
58 | 
59 |     # Reset groups
60 |     mpu.destroy_tensor_model_parallel()
61 | 
62 |     torch.distributed.barrier()
63 |     if torch.distributed.get_rank() == 0:
64 |         print('>> passed the test :-)')
65 | 
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     initialize_distributed()
70 |     world_size = torch.distributed.get_world_size()
71 | 
72 |     tensor_model_parallel_size = 1
73 |     while tensor_model_parallel_size <= world_size:
74 |         print_separator('test test broadcast data')
75 |         test_broadcast_data(tensor_model_parallel_size)
76 |         tensor_model_parallel_size *= 2
77 | 


--------------------------------------------------------------------------------
/src/megatron/mpu/tests/test_initialize.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import print_separator
 4 | from commons import initialize_distributed
 5 | import mpu
 6 | import torch
 7 | import sys
 8 | sys.path.append("../..")
 9 | 
10 | 
11 | def test_initialize_model_parallel(tensor_model_parallel_size):
12 | 
13 |     if torch.distributed.get_rank() == 0:
14 |         print('> testing initialize_model_parallel with size {} ...'.format(
15 |             tensor_model_parallel_size))
16 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
17 |                                torch.distributed.get_world_size())
18 |     assert not mpu.model_parallel_is_initialized()
19 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
20 |     assert mpu.model_parallel_is_initialized()
21 | 
22 |     # Checks.
23 |     def check(group, world_size, rank):
24 |         assert world_size == torch.distributed.get_world_size(group=group)
25 |         assert rank == torch.distributed.get_rank(group=group)
26 | 
27 |     # Model parallel.
28 |     world_size = tensor_model_parallel_size_
29 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
30 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
31 |     assert rank == mpu.get_tensor_model_parallel_rank()
32 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
33 | 
34 |     # Data parallel.
35 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
36 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size
37 |     assert world_size == mpu.get_data_parallel_world_size()
38 |     assert rank == mpu.get_data_parallel_rank()
39 |     check(mpu.get_data_parallel_group(), world_size, rank)
40 | 
41 |     # Reset groups
42 |     mpu.destroy_model_parallel()
43 | 
44 |     torch.distributed.barrier()
45 |     if torch.distributed.get_rank() == 0:
46 |         print('>> passed the test :-)')
47 | 
48 | 
49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
50 | 
51 |     if torch.distributed.get_rank() == 0:
52 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
53 |             tensor_model_parallel_size_))
54 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
55 |                               torch.distributed.get_world_size())
56 |     assert not mpu.model_parallel_is_initialized()
57 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
58 |     assert mpu.model_parallel_is_initialized()
59 | 
60 |     # Checks
61 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
62 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
63 | 
64 |     # Reset groups
65 |     mpu.destroy_model_parallel()
66 | 
67 |     torch.distributed.barrier()
68 |     if torch.distributed.get_rank() == 0:
69 |         print('>> passed the test :-)')
70 | 
71 | 
72 | if __name__ == '__main__':
73 | 
74 |     initialize_distributed()
75 |     world_size = torch.distributed.get_world_size()
76 |     tensor_model_parallel_size = 1
77 |     while tensor_model_parallel_size <= world_size:
78 |         print_separator('test initialize model parallel')
79 |         test_initialize_model_parallel(tensor_model_parallel_size)
80 |         print_separator('test model parallel source rank')
81 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
82 |         tensor_model_parallel_size *= 2
83 | 


--------------------------------------------------------------------------------
/src/megatron/optimizer/clip_grads.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Gradient clipping."""
  4 | 
  5 | import torch
  6 | try:
  7 |     from torch._six import inf as inf
  8 | except ModuleNotFoundError:
  9 |     from torch import inf as inf
 10 | 
 11 | from deepspeed.accelerator import get_accelerator
 12 | if get_accelerator().device_name() == 'cuda':
 13 |     from apex.multi_tensor_apply import multi_tensor_applier
 14 |     import amp_C
 15 | 
 16 | from megatron.model.module import param_is_not_shared
 17 | from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate
 18 | 
 19 | 
 20 | def clip_grad_norm_fp32(parameters, grads_for_norm,
 21 |                         max_norm, norm_type=2,
 22 |                         model_parallel_group=None):
 23 |     """Clips gradient norm of an iterable of parameters whose gradients
 24 |        are in fp32.
 25 | 
 26 |     This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
 27 |     added functionality to handle model parallel parameters. Note that
 28 |     the gradients are modified in place.
 29 | 
 30 |     Arguments:
 31 |         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
 32 |             single Tensor that will have gradients normalized
 33 |         grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single
 34 |             Tensor that will be used for calculating the grad norm.
 35 |         max_norm (float or int): max norm of the gradients
 36 |         norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
 37 |             infinity norm.
 38 |         model_parallel_group (group): given the nature of the distributed
 39 |             optimizer, this is passed as an argument.
 40 | 
 41 |     Returns:
 42 |         Total norm of the parameters (viewed as a single vector).
 43 |     """
 44 | 
 45 |     if isinstance(parameters, torch.Tensor):
 46 |         parameters = [parameters]
 47 |     if isinstance(grads_for_norm, torch.Tensor):
 48 |         grads_for_norm = [grads_for_norm]
 49 | 
 50 |     # Grads.
 51 |     grads = []
 52 |     for param in parameters:
 53 |         if param.grad is not None:
 54 |             assert param.grad.type() == 'torch.{}.FloatTensor'.format(get_accelerator().device_name())
 55 |             grads.append(param.grad.detach())
 56 | 
 57 |     # Norm parameters.
 58 |     max_norm = float(max_norm)
 59 |     norm_type = float(norm_type)
 60 |     total_norm = 0.0
 61 | 
 62 |     # Calculate norm.
 63 |     if norm_type == inf:
 64 |         total_norm = max(grad.abs().max() for grad in grads_for_norm)
 65 |         total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])
 66 |         # Take max across all model-parallel GPUs.
 67 |         torch.distributed.all_reduce(total_norm_cuda,
 68 |                                      op=torch.distributed.ReduceOp.MAX,
 69 |                                      group=model_parallel_group)
 70 |         total_norm = total_norm_cuda[0].item()
 71 | 
 72 |     else:
 73 |         if norm_type == 2.0:
 74 |             if get_accelerator().device_name() == 'cuda':
 75 |                 dummy_overflow_buf = torch.cuda.IntTensor([0])
 76 |                 # Use apex's multi-tensor applier for efficiency reasons.
 77 |                 # Multi-tensor applier takes a function and a list of list
 78 |                 # and performs the operation on that list all in one kernel.
 79 |                 if grads_for_norm:
 80 |                     grad_norm, _ = multi_tensor_applier(
 81 |                         amp_C.multi_tensor_l2norm,
 82 |                         dummy_overflow_buf,
 83 |                         [grads_for_norm],
 84 |                         False # no per-parameter norm
 85 |                     )
 86 |                 else:
 87 |                     grad_norm = torch.cuda.FloatTensor([0])
 88 |             else:
 89 |                 grad_norm = torch.norm(grads_for_norm,p=2.0)
 90 |             # Since we will be summing across data parallel groups,
 91 |             # we need the pow(norm-type).
 92 |             total_norm = grad_norm ** norm_type
 93 |         else:
 94 |             for grad in grads_for_norm:
 95 |                 grad_norm = torch.norm(grad, norm_type)
 96 |                 total_norm += grad_norm ** norm_type
 97 | 
 98 |         # Sum across all model-parallel GPUs.
 99 |         torch.distributed.all_reduce(total_norm,
100 |                                      op=torch.distributed.ReduceOp.SUM,
101 |                                      group=model_parallel_group)
102 |         total_norm = total_norm.item() ** (1.0 / norm_type)
103 | 
104 |     # Scale.
105 |     clip_coeff = max_norm / (total_norm + 1.0e-6)
106 |     if clip_coeff < 1.0:
107 |         if get_accelerator().device_name() == 'cuda':
108 |             dummy_overflow_buf = get_accelerator().IntTensor([0])
109 |             multi_tensor_applier(amp_C.multi_tensor_scale,
110 |                                 dummy_overflow_buf,
111 |                                 [grads, grads],
112 |                                 clip_coeff)
113 |         else:
114 |             for g in grads:
115 |                 g.detach().mul_(clip_coeff.to(g.device))
116 | 
117 |     return total_norm
118 | 
119 | 
120 | def count_zeros_fp32(parameters, model_parallel_group):
121 | 
122 |     if isinstance(parameters, torch.Tensor):
123 |         parameters = [parameters]
124 | 
125 |     # Filter parameters based on:
126 |     #   - grad should not be none
127 |     #   - parameter should not be shared
128 |     #   - should not be a replica due to tensor model parallelism
129 |     total_num_zeros = torch.cuda.FloatTensor([0.0])
130 |     for param in parameters:
131 |         grad_not_none = param.grad is not None
132 |         is_not_shared = param_is_not_shared(param)
133 |         is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
134 |         if grad_not_none and is_not_shared and is_not_tp_duplicate:
135 |             grad = param.grad.detach()
136 |             num_zeros = grad.numel() - torch.count_nonzero(grad)
137 |             total_num_zeros = num_zeros + total_num_zeros
138 | 
139 |     # Sum across all model-parallel GPUs.
140 |     torch.distributed.all_reduce(total_num_zeros,
141 |                                  op=torch.distributed.ReduceOp.SUM,
142 |                                  group=model_parallel_group)
143 | 
144 |     total_num_zeros = total_num_zeros.item()
145 | 
146 |     return total_num_zeros
147 | 


--------------------------------------------------------------------------------
/src/megatron/optimizer/grad_scaler.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Megatron grad scaler."""
  4 | 
  5 | from abc import ABC
  6 | from abc import abstractmethod
  7 | 
  8 | import torch
  9 | from deepspeed.accelerator import get_accelerator
 10 | 
 11 | class MegatronGradScaler(ABC):
 12 | 
 13 |     def __init__(self, initial_scale):
 14 |         """Initialize scale value with the input initial scale."""
 15 |         assert initial_scale > 0.0
 16 |         self._scale = get_accelerator().FloatTensor([initial_scale])
 17 | 
 18 |     @property
 19 |     def scale(self):
 20 |         return self._scale
 21 | 
 22 |     @property
 23 |     def inv_scale(self):
 24 |         return self._scale.double().reciprocal().float()
 25 | 
 26 |     @abstractmethod
 27 |     def update(self, found_inf):
 28 |         pass
 29 | 
 30 |     @abstractmethod
 31 |     def state_dict(self):
 32 |         pass
 33 | 
 34 |     @abstractmethod
 35 |     def load_state_dict(self, state_dict):
 36 |         pass
 37 | 
 38 | 
 39 | 
 40 | class ConstantGradScaler(MegatronGradScaler):
 41 | 
 42 |     def update(self, found_inf):
 43 |         pass
 44 | 
 45 |     def state_dict(self):
 46 |         return dict()
 47 | 
 48 |     def load_state_dict(self, state_dict):
 49 |         pass
 50 | 
 51 | 
 52 | 
 53 | class DynamicGradScaler(MegatronGradScaler):
 54 | 
 55 |     def __init__(self, initial_scale, min_scale,
 56 |                  growth_factor, backoff_factor,
 57 |                  growth_interval, hysteresis):
 58 |         """"Grad scaler with dynamic scale that gets adjusted
 59 |         during training."""
 60 |         super(DynamicGradScaler, self).__init__(initial_scale)
 61 | 
 62 |         # Lower bound on the scale.
 63 |         assert min_scale > 0.0
 64 |         assert min_scale <= initial_scale
 65 |         self.min_scale = get_accelerator().FloatTensor([min_scale])
 66 |         # Growth and backoff factors for the scale.
 67 |         assert growth_factor > 1.0
 68 |         self.growth_factor = get_accelerator().FloatTensor([growth_factor])
 69 |         assert backoff_factor < 1.0
 70 |         assert backoff_factor > 0.0
 71 |         self.backoff_factor = get_accelerator().FloatTensor([backoff_factor])
 72 |         # Interval over which if we don't see any inf/nan,
 73 |         # we will scale the grad scale by the growth factor.
 74 |         assert growth_interval > 0
 75 |         self.growth_interval = growth_interval
 76 |         # Number of inf/nans we should see before scaling down
 77 |         # the grad scale by the backoff factor.
 78 |         assert hysteresis > 0
 79 |         self.hysteresis = hysteresis
 80 | 
 81 |         # Trackers.
 82 |         self._growth_tracker = 0
 83 |         self._hysteresis_tracker = self.hysteresis
 84 | 
 85 | 
 86 |     def update(self, found_inf):
 87 | 
 88 |         # If we have an inf/nan, growth tracker is set to 0
 89 |         # and hysterisis tracker is reduced by 1.
 90 |         if found_inf:
 91 |             self._growth_tracker = 0
 92 |             self._hysteresis_tracker -= 1
 93 |             # Now if we are out of hysteresis count, scale down the loss.
 94 |             if self._hysteresis_tracker <= 0:
 95 |                 self._scale = torch.max(self._scale * self.backoff_factor,
 96 |                                         self.min_scale)
 97 |         else:
 98 |             # If there is no nan/inf, increment the growth tracker.
 99 |             self._growth_tracker += 1
100 |             # If we have had enough consequitive intervals with no nan/inf:
101 |             if self._growth_tracker == self.growth_interval:
102 |                 # Reset the tracker and hysteresis trackers,
103 |                 self._growth_tracker = 0
104 |                 self._hysteresis_tracker = self.hysteresis
105 |                 # and scale up the loss scale.
106 |                 self._scale = self._scale * self.growth_factor
107 | 
108 | 
109 |     def state_dict(self):
110 |         state_dict = {}
111 |         state_dict['scale'] = self._scale
112 |         state_dict['growth_tracker'] = self._growth_tracker
113 |         state_dict['hysteresis_tracker'] = self._hysteresis_tracker
114 |         return state_dict
115 | 
116 | 
117 |     def load_state_dict(self, state_dict):
118 |         self._scale = state_dict['scale'].to(get_accelerator().current_device_name())
119 |         self._growth_tracker = state_dict['growth_tracker']
120 |         self._hysteresis_tracker = state_dict['hysteresis_tracker']
121 | 


--------------------------------------------------------------------------------
/src/megatron/static/index.html:
--------------------------------------------------------------------------------
  1 | <!-- coding=utf-8-->
  2 | <!-- Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.-->
  3 | 
  4 | <!DOCTYPE html>
  5 | <html lang="en">
  6 | <head>
  7 | <meta charset="utf-8" />
  8 | <title>Megatron</title>
  9 | <style>
 10 | 
 11 | .wrapper {
 12 | max-width: 75%;
 13 | margin: auto;
 14 | }
 15 | 
 16 | h1 { 
 17 | margin: 3rem 0 1rem 0; 
 18 | padding: 0;
 19 | font-size: 1.5rem;
 20 | }
 21 | 
 22 | textarea {
 23 | width: 100%;
 24 | min-height: 300px;
 25 | resize: none;
 26 | border-radius: 8px;
 27 | border: 1px solid #ddd;
 28 | padding: 0.5rem;
 29 | box-shadow: inset 0 0 0.25rem #ddd;
 30 | &:focus {
 31 | outline: none;
 32 | border: 1px solid darken(#ddd, 5%);
 33 | box-shadow: inset 0 0 0.5rem darken(#ddd, 5%);
 34 | }
 35 | }
 36 | 
 37 | #the-count {
 38 | float: right;
 39 | padding: 0.1rem 0 0 0;
 40 | font-size: 0.875rem;
 41 | }
 42 | /* Chat containers */
 43 | .container {
 44 | font-family: 'Arial', sans-serif;
 45 | font-size: 16px;
 46 | border: 2px solid #dedede;
 47 | background-color: #f1f1f1;
 48 | border-radius: 5px;
 49 | padding: 15px;
 50 | margin: 10px 0;
 51 | }
 52 | 
 53 | 
 54 | /* Clear floats */
 55 | .container::after {
 56 | content: "";
 57 | clear: both;
 58 | display: table;
 59 | }
 60 | 
 61 | /* Style images */
 62 | .container img {
 63 | float: left;
 64 | max-width: 60px;
 65 | width: 100%;
 66 | margin-right: 20px;
 67 | border-radius: 50%;
 68 | }
 69 | 
 70 | </style>
 71 | </head>
 72 | <body>
 73 | <div class="wrapper">
 74 | <h1>Prompt Megatron</h1>
 75 | <textarea name="prompt" id="prompt" maxlength="1024" placeholder="Add prompt"autofocus></textarea>
 76 | <label for="tokens_to_generate">Number tokens to generate (1-1024):</label>
 77 | <input type="number" id="tokens_to_generate" name="tokens_to_generate" min="10" max="256", value=32>
 78 | <button onclick="submit_query()">Submit</button>
 79 | 
 80 | <div id="the-count">
 81 | <span id="current">0</span>
 82 | <span id="maximum">/ 1000</span>
 83 | </div>
 84 | <textarea name="response" id="response" maxlength="2048" placeholder="Megatron response..."></textarea>
 85 | </div>
 86 | <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
 87 | <script type="text/javascript">
 88 | 
 89 | 	function submit_query() {
 90 | 		$("#response").val("Waiting for Megatron response...");
 91 | 		$.ajax({
 92 | 			url:"api",
 93 | 			type:"PUT",
 94 | 			data:JSON.stringify({prompts: [$("#prompt").val()], tokens_to_generate: parseInt($("#tokens_to_generate").val(),10)}),
 95 | 			contentType:"application/json; charset=utf-8",
 96 | 			dataType:"json",
 97 | 			success: function(data){
 98 | 				data.max_len=35;
 99 | 				$("#response").val(data.text);
100 | 			}
101 | 		});
102 | 	}
103 | 	
104 | $('textarea').keyup(function() {
105 | var characterCount = $(this).val().length,
106 | current = $('#current'),
107 | maximum = $('#maximum'),
108 | theCount = $('#the-count');
109 | 
110 | current.text(characterCount);
111 | 
112 | if (characterCount >= 800) {
113 | maximum.css('color', '#8f0001');
114 | current.css('color', '#8f0001');
115 | theCount.css('font-weight','bold');
116 | } else {
117 | maximum.css('color','#666');
118 | theCount.css('font-weight','normal');
119 | }
120 | });
121 | </script>
122 | </body>
123 | </html>
124 | 
125 | 


--------------------------------------------------------------------------------
/src/megatron/text_generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .api import (
5 |     generate,
6 |     generate_and_post_process,
7 |     beam_search_and_post_process)
8 | 


--------------------------------------------------------------------------------
/src/megatron/text_generation/beam_utils.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | ## from huggingface beam search
19 | class BeamHypotheses(object):
20 |     def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
21 |         """
22 |         Initialize n-best list of hypotheses.
23 |         """
24 |         self.length_penalty = length_penalty
25 |         self.early_stopping = early_stopping
26 |         self.num_beams = num_beams
27 |         self.beams = []
28 |         self.worst_score = 1e9
29 | 
30 |     def __len__(self):
31 |         """
32 |         Number of hypotheses in the list.
33 |         """
34 |         return len(self.beams)
35 | 
36 |     def add(self, hyp, sum_logprobs, length):
37 |         """
38 |         Add a new hypothesis to the list.
39 |         """
40 |         score = sum_logprobs / length ** self.length_penalty
41 |         if len(self) < self.num_beams or score > self.worst_score:
42 |             self.beams.append((score, hyp))
43 |             if len(self) > self.num_beams:
44 |                 sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
45 |                 del self.beams[sorted_scores[0][1]]
46 |                 self.worst_score = sorted_scores[1][0]
47 |             else:
48 |                 self.worst_score = min(score, self.worst_score)
49 | 
50 |     def is_done(self, best_sum_logprobs, cur_len):
51 |         """
52 |         If there are enough hypotheses and that none of the hypotheses being generated
53 |         can become better than the worst one in the heap, then we are done with this sentence.
54 |         """
55 | 
56 |         if len(self) < self.num_beams:
57 |             return False
58 |         elif self.early_stopping:
59 |             return True
60 |         else:
61 |             cur_score = best_sum_logprobs / cur_len ** self.length_penalty
62 |             ret = self.worst_score >= cur_score
63 |             return ret
64 | 
65 | 


--------------------------------------------------------------------------------
/src/megatron/text_generation/sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Sampling utilities.
 4 | Part of this code is inspired by:
 5 |  - https://github.com/ari-holtzman/degen/blob/master/gen.py
 6 |  - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
 7 | """
 8 | 
 9 | 
10 | import torch
11 | 
12 | 
13 | 
14 | def modify_logits_for_top_k_filtering(logits, top_k):
15 |     """Set the logits for none top-k values to -inf."""
16 | 
17 |     filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
18 |     logits.masked_fill_(filter_, float('-Inf'))
19 | 
20 | 
21 | 
22 | def modify_logits_for_top_p_filtering(logits, top_p):
23 |     """Set the logits for none top-p values to -inf."""
24 | 
25 |     # First sort and calculate cumulative sum of probabilities.
26 |     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
27 |     cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
28 | 
29 |     # Filteration based on the cumulative sum.
30 |     filter_ = cumulative_probs > top_p
31 |     # This shift by 1 is weird and I cannot justify it. This existed
32 |     # in the original implementation:
33 |     #   https://github.com/ari-holtzman/degen/blob/master/gen.py
34 |     # and I guess it is needed so keeping it for now.
35 |     filter_[:, 1:] = filter_[:, :-1].clone()
36 |     # Make sure we at least have one token to select from.
37 |     filter_[..., 0] = 0
38 | 
39 |     # Fill in the filtered part
40 |     filter_ = filter_.scatter(1, sorted_indices, filter_)
41 |     logits.masked_fill_(filter_, float('-Inf'))
42 | 
43 | 
44 | 
45 | def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
46 |     """ Sample and generate a token.
47 |     Note: logits has the dimension [b, v] where b is the batch size
48 |           and v is the vocabulary size.
49 |     If vocab_size is provided, we will make sure the sample that is
50 |     generated is in [0, vocab-size). This will avoid out of vocabulary
51 |     generations due to padding.
52 |     """
53 | 
54 |     # Check logits for consistency.
55 |     assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
56 |     assert logits.type() == 'torch.cuda.FloatTensor', \
57 |         'input logits should be floats.'
58 | 
59 | 
60 |     # Greedy is just simple argmax.
61 |     if top_k == 1:
62 |         assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
63 |         samples = torch.argmax(logits, dim=-1)
64 | 
65 |     # Top-k or top-p sampling.
66 |     else:
67 |         # Clone so we do not modify the inputs,
68 |         logits = logits.clone()
69 |         # Apply temperature in place.
70 |         if temperature != 1.0:
71 |             logits.div_(temperature)
72 | 
73 |         if top_k > 1:
74 |             assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
75 |             assert top_k <= logits.size(1), 'top-k is larger than logit size.'
76 |             if vocab_size:
77 |                 assert top_k < vocab_size, 'top-k is larger than vocab size.'
78 |             modify_logits_for_top_k_filtering(logits, top_k)
79 | 
80 |         elif top_p > 0.0:
81 |             assert top_p <= 1.0, 'top-p should be in (0, 1].'
82 |             modify_logits_for_top_p_filtering(logits, top_p)
83 | 
84 |         # After filtering, we need to recalculate the distribution.
85 |         probs = logits.softmax(dim=-1)
86 |         samples = torch.multinomial(probs, num_samples=1).view(-1)
87 | 
88 |     # If vocab size is provided, make sure the samples are in
89 |     # in the range [0, vocab-size).
90 |     if vocab_size:
91 |         samples = torch.clamp(samples, min=0, max=(vocab_size - 1))
92 | 
93 |     return samples
94 | 


--------------------------------------------------------------------------------
/src/megatron/text_generation/tokenization.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """Tokenization utilities."""
  4 | 
  5 | 
  6 | import torch
  7 | 
  8 | 
  9 | from megatron import get_tokenizer, get_args
 10 | from .communication import broadcast_int_list, broadcast_tensor
 11 | 
 12 | 
 13 | def detokenize_generations(tokens_gpu_tensor,
 14 |                            lengths_gpu_tensor,
 15 |                            return_segments):
 16 |     """Detokenize the generated tokens."""
 17 | 
 18 |     tokenizer = get_tokenizer()
 19 |     args = get_args()
 20 |     prompts_plus_generations = []
 21 |     if return_segments:
 22 |         prompts_plus_generations_segments = []
 23 | 
 24 |     tokens = tokens_gpu_tensor.cpu().numpy().tolist()
 25 |     lengths = lengths_gpu_tensor.cpu().numpy().tolist()
 26 |     for sequence_tokens, length in zip(tokens, lengths):
 27 |         sequence_tokens = sequence_tokens[:length]
 28 |         prompts_plus_generations.append(
 29 |             tokenizer.detokenize(sequence_tokens))
 30 |         if return_segments:
 31 |             words = []
 32 |             for token in sequence_tokens:
 33 |                 if args.tokenizer_type in ['SentencePieceTokenizer', 
 34 |                         'GPTSentencePieceTokenizer']:
 35 |                     word = tokenizer.decoder[token]
 36 |                 elif args.tokenizer_type == 'NullTokenizer':
 37 |                     word = str(token)
 38 |                 else:
 39 |                     word = tokenizer.tokenizer.decoder[token]
 40 |                     word = bytearray(
 41 |                         [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
 42 |                             'utf-8', errors='replace')
 43 |                 words.append(word)
 44 |             prompts_plus_generations_segments.append(words)
 45 | 
 46 |     if return_segments:
 47 |         return tokens, prompts_plus_generations, \
 48 |             prompts_plus_generations_segments
 49 | 
 50 |     return tokens, prompts_plus_generations
 51 | 
 52 | 
 53 | def tokenize_prompts(prompts=None, tokens_to_generate=None,
 54 |                      add_BOS=None, rank=0):
 55 |     """Tokenize prompts and make them avaiable on all ranks."""
 56 | 
 57 |     # On all ranks set to None so we can pass them to functions
 58 |     sizes_list = None
 59 |     prompts_tokens_cuda_long_tensor = None
 60 |     prompts_length_cuda_long_tensor = None
 61 | 
 62 |     # On the specified rank, build the above.
 63 |     if torch.distributed.get_rank() == rank:
 64 |         assert prompts is not None
 65 |         assert tokens_to_generate is not None
 66 |         # Tensor of tokens padded and their unpadded length.
 67 |         prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
 68 |             _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
 69 |         # We need the sizes of these tensors for the boradcast
 70 |         sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size
 71 |                       prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght
 72 | 
 73 |     # First, broadcast the sizes.
 74 |     sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
 75 | 
 76 |     # Now that we have the sizes, we can boradcast the tokens
 77 |     # and length tensors.
 78 |     sizes = sizes_tensor.tolist()
 79 |     prompts_tokens_cuda_long_tensor = broadcast_tensor(
 80 |         sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
 81 |     prompts_length_cuda_long_tensor = broadcast_tensor(
 82 |         sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
 83 |         rank=rank)
 84 | 
 85 |     return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
 86 | 
 87 | 
 88 | def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
 89 |     """Given a set of prompts and number of tokens to generate:
 90 |         - tokenize prompts
 91 |         - set the sequence length to be the max of length of prompts
 92 |           plus the number of tokens we would like to generate
 93 |         - pad all the sequences to this length so we can convert them
 94 |           into a 2D tensor.
 95 |     """
 96 | 
 97 |     # Tokenize all the prompts.
 98 |     tokenizer = get_tokenizer()
 99 |     if add_BOS:
100 |         prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
101 |                           for prompt in prompts]
102 |     else:
103 |         prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
104 | 
105 |     # Now we have a list of list of tokens which each list has a different
106 |     # size. We want to extend this list to:
107 |     #   - incorporate the tokens that need to be generated
108 |     #   - make all the sequences equal length.
109 |     # Get the prompts length.
110 |     prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
111 |     # Get the max prompts length.
112 |     max_prompt_len = max(prompts_length)
113 |     # Number of tokens in the each sample of the batch.
114 |     samples_length = max_prompt_len + tokens_to_generate
115 |     # Now update the list of list to be of the same size: samples_length.
116 |     for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
117 |         padding_size = samples_length - prompt_length
118 |         prompt_tokens.extend([tokenizer.eod] * padding_size)
119 | 
120 |     # Now we are in a structured format, we can convert to tensors.
121 |     prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens)
122 |     prompts_length_tensor = torch.cuda.LongTensor(prompts_length)
123 | 
124 |     return prompts_tokens_tensor, prompts_length_tensor
125 | 


--------------------------------------------------------------------------------
/src/megatron/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .tokenizer import build_tokenizer
5 | 


--------------------------------------------------------------------------------
/src/scripts/generate_text.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export TORCH_CUDA_ARCH_LIST=8.6+PTX
 3 | CHECKPOINT_PATH=dataset/checkpoints/gpt2_345m
 4 | VOCAB_FILE=dataset/gpt2-vocab.json
 5 | MERGE_FILE=dataset/gpt2-merges.txt
 6 | b=8
 7 | mp=1
 8 | experts=1
 9 | nodes=1
10 | gpus=1
11 | 
12 | 
13 | use_tutel=""
14 | #use_tutel="--use-tutel"
15 | 
16 | 
17 | ds_inference=""
18 | #ds_inference="--ds-inference"
19 | 
20 | export CUDA_DEVICE_MAX_CONNECTIONS=1
21 | 
22 | launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus"
23 | L=24
24 | H=1024
25 | A=16
26 | #experts1=${experts[$k]}
27 | program_cmd="tools/generate_samples_gpt.py \
28 |        --tensor-model-parallel-size $mp \
29 |        --num-layers $L \
30 |        --hidden-size $H \
31 |        --num-attention-heads $A \
32 |        --max-position-embeddings 1024 \
33 |        --tokenizer-type GPT2BPETokenizer \
34 |        --fp16 \
35 |        --num-experts ${experts} \
36 |        --mlp-type standard \
37 |        --micro-batch-size $b \
38 |        --seq-length 1024 \
39 |        --out-seq-length 1024 \
40 |        --temperature 1.0 \
41 |        --vocab-file $VOCAB_FILE \
42 |        --merge-file $MERGE_FILE \
43 |        --genfile unconditional_samples.json \
44 |        --top_p 0.9 \
45 |        --log-interval 1 \
46 |        --num-samples 0 \
47 |        --load $CHECKPOINT_PATH \
48 |        $use_tutel $ds_inference"
49 | 
50 | echo $launch_cmd $program_cmd
51 | $launch_cmd $program_cmd
52 | 


--------------------------------------------------------------------------------
/src/scripts/gpt/ds_config_gpt_TEMPLATE.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_batch_size": GBSIZE,
 3 |   "train_micro_batch_size_per_gpu": MBSIZE,
 4 |   "steps_per_print": LOG_INTERVAL,
 5 | 
 6 |   "zero_optimization": {
 7 |     "stage": ZERO_STAGE,
 8 |     "elastic_checkpoint": true
 9 |   },
10 | 
11 |   "gradient_clipping": 1.0,
12 |   "prescale_gradients": PRESCALE_GRAD,
13 | 
14 |   "fp16": {
15 |     "enabled": true,
16 |     "loss_scale": 0,
17 |     "loss_scale_window": 500,
18 |     "hysteresis": 2,
19 |     "min_loss_scale": 1,
20 |     "initial_scale_power": 11
21 |   },
22 | 
23 |   "wall_clock_breakdown" : false
24 | }
25 | 


--------------------------------------------------------------------------------
/src/scripts/pretrain_llama_distributed.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # This example script is contributed by external user https://github.com/LydiaXiaohongLi
  3 | set -ex
  4 | 
  5 | ######################################
  6 | # Change the below configurations here
  7 | BASE_PATH=./tmp
  8 | DS_CONFIG=${BASE_PATH}/deepspeed.json
  9 | DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence"
 10 | DATASET="1 ${DATASET_1}"
 11 | CHECKPOINT_PATH=./tmp
 12 | TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model
 13 | 
 14 | TP=2
 15 | PP=2
 16 | ZERO_STAGE=0
 17 | 
 18 | GPUS_PER_NODE=8
 19 | MASTER_ADDR=localhost
 20 | MASTER_PORT=6000
 21 | NNODES=1
 22 | NODE_RANK=0
 23 | 
 24 | HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
 25 | FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
 26 | NUM_LAYERS=24 # e.g. llama-13b: 40
 27 | NUM_HEADS=16 # e.g. llama-13b: 40
 28 | SEQ_LENGTH=2048
 29 | 
 30 | MICRO_BATCH_SIZE=4
 31 | GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens
 32 | TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
 33 | LR=3e-4
 34 | MIN_LR=3e-5
 35 | LR_WARMUP_STEPS=2000
 36 | WEIGHT_DECAY=0.1
 37 | GRAD_CLIP=1
 38 | 
 39 | ## Activation checkpointing saves GPU memory, but reduces training speed
 40 | # activation_checkpoint="true"
 41 | activation_checkpoint="false"
 42 | 
 43 | # Below configuration required for llama model as per llama paper
 44 | # --no-query-key-layer-scaling \
 45 | # --attention-dropout 0 \
 46 | # --hidden-dropout 0 \
 47 | # --use-rotary-position-embeddings \
 48 | # --untie-embeddings-and-output-weights \
 49 | # --swiglu \
 50 | # --normalization rmsnorm \
 51 | # --disable-bias-linear \
 52 | ######################################
 53 | 
 54 | 
 55 | 
 56 | cat <<EOT > $DS_CONFIG
 57 | {
 58 |   "train_batch_size" : $GLOBAL_BATCH_SIZE,
 59 |   "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
 60 |   "steps_per_print": 1,
 61 |   "zero_optimization": {
 62 |     "stage": $ZERO_STAGE
 63 |   },
 64 |   "bf16": {
 65 |     "enabled": true
 66 |   }
 67 | }
 68 | EOT
 69 | 
 70 | ds_args=""
 71 | ds_args=" --deepspeed ${ds_args}"
 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
 74 | 
 75 | if [ "${activation_checkpoint}" = "true" ]; then
 76 |   ds_args="--deepspeed-activation-checkpointing ${ds_args}"
 77 | 
 78 |   ## old argument for recomputing the transformer layer
 79 |   # ds_args="--checkpoint-activations ${ds_args}"
 80 | 
 81 |   ## new argument for recomputing the transformer layer
 82 |   ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}"
 83 |   ## new argument for recomputing only the attention layer
 84 |   # ds_args="--recompute-granularity selective ${ds_args}"
 85 | fi
 86 | 
 87 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 88 | 
 89 | torchrun $DISTRIBUTED_ARGS \
 90 |        pretrain.py \
 91 |        --tensor-model-parallel-size $TP \
 92 |        --pipeline-model-parallel-size $PP \
 93 |        --num-layers $NUM_LAYERS \
 94 |        --hidden-size $HIDDEN_SIZE \
 95 |        --ffn-hidden-size $FFN_HIDDEN_SIZE \
 96 |        --num-attention-heads $NUM_HEADS \
 97 |        --micro-batch-size $MICRO_BATCH_SIZE \
 98 |        --global-batch-size $GLOBAL_BATCH_SIZE \
 99 |        --seq-length $SEQ_LENGTH \
100 |        --max-position-embeddings $SEQ_LENGTH \
101 |        --train-iters $TRAIN_STEPS \
102 |        --save $CHECKPOINT_PATH \
103 |        --load $CHECKPOINT_PATH \
104 |        --data-path $DATASET \
105 |        --data-impl mmap \
106 |        --tokenizer-type GPTSentencePieceTokenizer \
107 |        --tokenizer-model $TOKENIZER_PATH \
108 |        --split 949,50,1 \
109 |        --distributed-backend nccl \
110 |        --lr $LR \
111 |        --lr-decay-style cosine \
112 |        --min-lr $MIN_LR \
113 |        --weight-decay $WEIGHT_DECAY \
114 |        --clip-grad $GRAD_CLIP \
115 |        --lr-warmup-iters $LR_WARMUP_STEPS \
116 |        --optimizer adam \
117 |        --adam-beta1 0.9 \
118 |        --adam-beta2 0.95 \
119 |        --log-interval 1 \
120 |        --save-interval 10000 \
121 |        --eval-interval 1000 \
122 |        --eval-iters 10 \
123 |        --bf16 \
124 |        --no-query-key-layer-scaling \
125 |        --attention-dropout 0 \
126 |        --hidden-dropout 0 \
127 |        --use-rotary-position-embeddings \
128 |        --untie-embeddings-and-output-weights \
129 |        --swiglu \
130 |        --normalization rmsnorm \
131 |        --disable-bias-linear \
132 |        $ds_args


--------------------------------------------------------------------------------
/src/scripts/run_deepspeed_example.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -ex
 3 | 
 4 | BASE_PATH=/vc_data/Megatron-LM/data
 5 | DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
 6 | DS_CONFIG=ds_config.json
 7 | 
 8 | TP=1
 9 | PP=1
10 | NLAYERS=24
11 | HIDDEN=512
12 | 
13 | GLOBAL_BATCH=64
14 | MICRO_BATCH=4
15 | 
16 | ZERO_STAGE=2
17 | 
18 | OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
19 | #OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
20 | mkdir -p $OUTPUT_DIR
21 | 
22 | cat <<EOT > $DS_CONFIG
23 | {
24 |   "train_batch_size" : $GLOBAL_BATCH,
25 |   "train_micro_batch_size_per_gpu": $MICRO_BATCH,
26 |   "steps_per_print": 1,
27 | 
28 |   "zero_optimization": {
29 |     "stage": $ZERO_STAGE
30 |   },
31 | 
32 |   "fp16": {
33 |     "enabled": true,
34 |     "initial_scale_power": 12
35 |   },
36 | 
37 |   "wall_clock_breakdown" : true
38 | }
39 | EOT
40 | 
41 | export NCCL_DEBUG=warn 
42 | 
43 | ds_args=""
44 | ds_args=" --deepspeed ${ds_args}"
45 | ds_args=" --no-pipeline-parallel ${ds_args}" 
46 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
47 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
48 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
49 | 
50 | 
51 | deepspeed ./src/pretrain.py \
52 |     --tensor-model-parallel-size $TP \
53 |     --pipeline-model-parallel-size $PP \
54 |     --num-layers $NLAYERS \
55 |     --hidden-size $HIDDEN \
56 |     --num-attention-heads 16 \
57 |     --seq-length 256 \
58 |     --loss-scale 12 \
59 |     --max-position-embeddings 1024 \
60 |     --micro-batch-size 4 \
61 |     --global-batch-size 1024 \
62 |     --train-iters 1000 \
63 |     --lr 6.0e-5 \
64 |     --min-lr 6.0e-6 \
65 |     --lr-decay-style cosine \
66 |     --log-interval 1 \
67 |     --eval-iters 40 \
68 |     --eval-interval 1000 \
69 |     --data-path $DATA_PATH \
70 |     --vocab-file $BASE_PATH/gpt2-vocab.json \
71 |     --merge-file $BASE_PATH/gpt2-merges.txt \
72 |     --save-interval 1000 \
73 |     --split 98,2,0 \
74 |     --clip-grad 1.0 \
75 |     --weight-decay 0.1 \
76 |     --adam-beta1 0.9 \
77 |     --adam-beta2 0.95 \
78 |     --init-method-std 0.006 \
79 |     --fp16 \
80 |     --checkpoint-activations \
81 |     --tensorboard-dir $OUTPUT_DIR \
82 |     $ds_args \
83 |     --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
84 | 
85 | 


--------------------------------------------------------------------------------
/src/scripts/sequence_parallel/README.md:
--------------------------------------------------------------------------------
 1 | # Sequence Parallelism
 2 | 
 3 | This folder contains examples that demonstrate how to use DeepSpeed's sequence parallelism.
 4 | 
 5 | ## Setting Up the Environment for FlashAttention
 6 | 
 7 | DeepSpeed's sequence parallelism can be combined with the following types of attention.
 8 | 
 9 | - Classic attention
10 | - FlashAttention (enabled by `--use-flash-attn`)
11 | - FlashAttention + Triton (enabled by `--use-flash-attn-triton`)
12 | 
13 | For the best performance, we recommend using FlashAttention + Triton. Here are the installation steps and the versions we have tested. Note that FlashAttention is compatible only with Turing, Ampere, Ada, or Hopper GPUs.
14 | 
15 | ```shell
16 | # install triton
17 | git clone -b legacy-backend https://github.com/openai/triton
18 | cd triton/python/
19 | pip install cmake
20 | pip install .
21 | 
22 | # install
23 | cd ${WORK_DIR}
24 | git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention
25 | cd flash-attention
26 | python setup.py install
27 | ```
28 | 
29 | ## Enabling Sequence Parallelism
30 | 
31 | To enable sequence parallelism, set the degree of parallelism using the `--ds-sequence-parallel-size` argument. Ensure that the number of attention heads is divisible by this value.
32 | Ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the document of [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) for more details.
33 | 
34 | Some working examples ([GPT1.3B](ds_pretrain_gpt_1.3B_seq_parallel_32k.sh), [GPT30B](ds_pretrain_gpt_30B_seq_parallel_32k.sh)), that enable sequence parallelism, are available in this foloder.
35 | 
36 | Please note that our sequence parallelism feature is currently incompatible with Megatron-LM's tensor or pipeline parallelism.
37 | 


--------------------------------------------------------------------------------
/src/scripts/sequence_parallel/ds_config_gpt_TEMPLATE.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_batch_size": GBSIZE,
 3 |   "train_micro_batch_size_per_gpu": MBSIZE,
 4 |   "steps_per_print": LOG_INTERVAL,
 5 | 
 6 |   "zero_optimization": {
 7 |     "stage": ZERO_STAGE,
 8 |     "elastic_checkpoint": true
 9 |   },
10 | 
11 |   "gradient_clipping": 1.0,
12 |   "prescale_gradients": PRESCALE_GRAD,
13 | 
14 |   "fp16": {
15 |     "enabled": true,
16 |     "loss_scale": 0,
17 |     "loss_scale_window": 500,
18 |     "hysteresis": 2,
19 |     "min_loss_scale": 1,
20 |     "initial_scale_power": 11
21 |   },
22 | 
23 |   "wall_clock_breakdown" : false
24 | }
25 | 


--------------------------------------------------------------------------------
/src/tools/convert_checkpoint/deepspeed_to_megatron.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import os
  5 | import torch
  6 | from collections import OrderedDict
  7 | from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint
  8 | 
  9 | MODEL_KEY = 'model'
 10 | ARGS_KEY = 'args'
 11 | LANGUGAGE_MODEL_KEY = 'language_model'
 12 | EMBEDDING_KEY = 'embedding'
 13 | ENCODER_KEY = 'encoder'
 14 | WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
 15 | WORD_EMBEDDINGS_KEY = 'word_embeddings'
 16 | FINAL_LAYER_NORM_KEY ='final_layernorm'
 17 | CHECKPOINT_VERSION_KEY = 'checkpoint_version'
 18 | CHECKPOINT_VERSION_VALUE = 3.0
 19 | ITERATION_KEY = 'iteration'
 20 | 
 21 | def parse_arguments():
 22 |     parser = argparse.ArgumentParser()
 23 |     parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder')
 24 |     parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder')
 25 |     parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree')
 26 |     parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree')
 27 |     parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.')
 28 |     args = parser.parse_args()
 29 |     print(f'args = {args}')
 30 |     return args
 31 | 
 32 | 
 33 | def _convert_ds_transformer_state(sd_list):
 34 |     new_sd = OrderedDict()
 35 |     for i, sd in enumerate(sd_list):
 36 |         for key, value in sd.items():
 37 |             new_key = f'layers.{i}.{key}'
 38 |             new_sd[new_key] = value
 39 | 
 40 |     return new_sd
 41 | 
 42 | def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
 43 |     path_list = []
 44 |     iter_folder = f'iter_{iteration:07d}'
 45 |     for i in range(0, tp_degree):
 46 |         path_list.append([])
 47 |         for j in range(0, pp_degree):
 48 |             rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
 49 |             ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
 50 |             path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path))
 51 | 
 52 |     return path_list
 53 | 
 54 | 
 55 | def _create_megatron_dict():
 56 |     language_model_dict = {
 57 |         EMBEDDING_KEY: {},
 58 |         ENCODER_KEY: {}
 59 |     }
 60 |     megatron_dict = {
 61 |         MODEL_KEY: {LANGUGAGE_MODEL_KEY: language_model_dict},
 62 |         CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
 63 |     }
 64 |     return megatron_dict
 65 | 
 66 | 
 67 | def _save_checkpoint(file_path, chkpt_sd):
 68 |     dir, _ = os.path.split(file_path)
 69 |     os.makedirs(dir, exist_ok=True)
 70 |     torch.save(chkpt_sd, file_path)
 71 | 
 72 | 
 73 | def _renest_sd(sd):
 74 |     new_sd = OrderedDict()
 75 |     for key, value in sd.items():
 76 |         a, b = key.split('.')
 77 |         new_sd[a] = {b: value}
 78 |     return new_sd
 79 | 
 80 | 
 81 | def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False):
 82 |     meg_encoder_sd = OrderedDict()
 83 |     meg_embedding_sd = OrderedDict()
 84 |     meg_embedding_for_head_sd = OrderedDict()
 85 | 
 86 |     transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
 87 |     meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))
 88 | 
 89 |     if pp_index in [0, ds_checkpoint.pp_degree - 1]:
 90 |         embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
 91 |         nested_embedding_sd = _renest_sd(embedding_sd)
 92 |         if pp_index == 0:
 93 |             meg_embedding_sd.update(nested_embedding_sd)
 94 | 
 95 |         if pp_index == ds_checkpoint.pp_degree -1:
 96 |             for key, value in embedding_sd.items():
 97 |                 if key.startswith(WORD_EMBEDDINGS_KEY):
 98 |                     fields = key.split('.')
 99 |                     new_fields = fields[1:]
100 |                     new_key = '.'.join(new_fields)
101 |                     meg_embedding_for_head_sd[new_key] = value
102 | 
103 |             final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
104 |             new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()}
105 |             meg_encoder_sd.update(new_final_norm_sd)
106 | 
107 |     checkpoint_sd = _create_megatron_dict()
108 | 
109 |     iteration = ds_checkpoint.get_iteration()
110 |     checkpoint_sd[ITERATION_KEY] = iteration
111 |     if pp_index == 0:
112 |         checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd
113 |     checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
114 |     if pp_index == ds_checkpoint.pp_degree -1:
115 |         checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
116 | 
117 |     checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
118 |     # Adjust specific fields
119 |     checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
120 |     checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
121 |     if for_release:
122 |         checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
123 |         checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0
124 | 
125 |     return checkpoint_sd
126 | 
127 | 
128 | def _create_latest_file(base_folder, iteration):
129 |     file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
130 |     os.makedirs(base_folder, exist_ok=True)
131 |     with open(file_path, 'w') as f:
132 |         f.write(str(iteration))
133 | 
134 | def main():
135 |     print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
136 | 
137 |     args = parse_arguments()
138 |     print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}')
139 | 
140 |     ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp)
141 |     iteration = ds_checkpoint.get_iteration()
142 |     _create_latest_file(args.output_folder, iteration)
143 |     checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree)
144 |     for i in range(0, ds_checkpoint.tp_degree):
145 |         for j in range(0, ds_checkpoint.pp_degree):
146 |             sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release)
147 |             _save_checkpoint(checkpoint_paths[i][j], sd)
148 | 
149 | if __name__ == "__main__":
150 |     main()


--------------------------------------------------------------------------------
/src/tools/convert_checkpoint/deepspeed_to_transformers.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import torch
 5 | import json
 6 | 
 7 | from deepspeed_checkpoint import DeepSpeedCheckpoint
 8 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments
 9 | 
10 | # the import was tested to work with this version
11 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
12 | # copying that version here instead
13 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
14 | from transformers import GPT2Config
15 | 
16 | def main():
17 | 
18 |     # this first part comes mainly from deepspeed_to_megatron.main
19 |     args = parse_arguments()
20 |     print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}')
21 | 
22 |     ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp)
23 |     iteration = ds_checkpoint.get_iteration()
24 |     input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release)
25 | 
26 |     # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
27 |     # Spell out all parameters in case the defaults change.
28 |     config = GPT2Config(
29 |         vocab_size=50257,
30 |         n_positions=1024,
31 |         n_ctx=1024,
32 |         n_embd=1024,
33 |         n_layer=24,
34 |         n_head=16,
35 |         n_inner=4096,
36 |         activation_function="gelu",  # used to be "gelu_new" in earlier versions
37 |         resid_pdrop=0.1,
38 |         embd_pdrop=0.1,
39 |         attn_pdrop=0.1,
40 |         layer_norm_epsilon=1e-5,
41 |         initializer_range=0.02,
42 |         summary_type="cls_index",
43 |         summary_use_proj=True,
44 |         summary_activation=None,
45 |         summary_proj_to_labels=True,
46 |         summary_first_dropout=0.1,
47 |         scale_attn_weights=True,
48 |         gradient_checkpointing=False,
49 |         use_cache=True,
50 |         bos_token_id=50256,
51 |         eos_token_id=50256,
52 |     )
53 | 
54 |     # Convert.
55 |     print("Converting to HF Checkpoint")
56 |     output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)
57 | 
58 |     basename = args.output_folder
59 |     os.makedirs(basename, exist_ok=True)
60 | 
61 |     # Print the structure of converted state dict.
62 |     #if args.print_checkpoint_structure:
63 |     #    recursive_print(None, output_state_dict)
64 | 
65 |     # Store the config to file.
66 |     output_config_file = os.path.join(basename, "config.json")
67 |     output_config = config.to_dict()
68 |     output_config["architectures"] = ["GPT2LMHeadModel"]
69 |     output_config["model_type"] = "gpt2"
70 |     print(f'Saving config to "{output_config_file}"')
71 |     with open(output_config_file, "w") as f:
72 |         json.dump(output_config, f)
73 | 
74 |     # Store the state_dict to file.
75 |     output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
76 |     print(f'Saving checkpoint to "{output_checkpoint_file}"')
77 |     torch.save(output_state_dict, output_checkpoint_file)
78 | 
79 |     print("Now add tokenizer files and upload to the hub")
80 | 
81 | 
82 | if __name__ == "__main__":
83 |     main()


--------------------------------------------------------------------------------
/src/tools/convert_checkpoint/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import argparse
  4 | import importlib
  5 | import torch.multiprocessing as mp
  6 | import os
  7 | import sys
  8 | 
  9 | # A loader is a python file with at least two functions
 10 | # - add_arguments - takes in a parser and adds any arguments needed
 11 | # - load_checkpoint - takes in the queue and parsed arguments
 12 | 
 13 | # A saver is similar but has save_checkpoint instead of
 14 | # load_checkpoint
 15 | 
 16 | # The loader and saver process are each given a queue, the loader
 17 | # should load the checkpoint and send the weights in messages in the
 18 | # following order, the saver should receive them in this order and
 19 | # save the checkpoints. A message consists of a python dictionary with
 20 | # a "name" for error checking and an entry for each tensor as
 21 | # indicated below. Note that the weight sent over the queue are the
 22 | # full model weights, nothing split.
 23 | 
 24 | # If the loader ever sends "exit" to the queue, that means something
 25 | # went wrong and it is exiting.
 26 | 
 27 | # - Metadata Namespace with the following attributes:
 28 | #     model_type - GPT, BERT, T5, etc.  (Part of protocol to allow this to be deduced later instead of given on command line)
 29 | #     num_layers - Number of transformer layers
 30 | #     hidden_size
 31 | #     seq_length
 32 | #     num_attention_heads
 33 | #     max_position_embeddings
 34 | #     tokenizer_type
 35 | #     iteration
 36 | #     params_dtype
 37 | #     bert_binary_head - Used only if model_type is BERT
 38 | #     previous_tensor_parallel_size - Optional
 39 | #     previous_pipeline_parallel_size - Optional
 40 | #     true_vocab_size
 41 | #     make_vocab_size_divisble_by
 42 | #     consumed_train_samples
 43 | #     consumed_valid_samples
 44 | # messages
 45 | # {
 46 | #   "name": "embeddings"
 47 | #   "position embeddings"
 48 | #   "word embeddings"
 49 | # }
 50 | # (for each transformer layer):
 51 | # {
 52 | #   "name": "transformer layer N"
 53 | #   "input layernorm weight"
 54 | #   "input layernorm bias"
 55 | #   "qkv weight"
 56 | #   "qkv bias"
 57 | #   "dense weight"
 58 | #   "dense bias"
 59 | #   "post layernorm weight"
 60 | #   "post layernorm bias"
 61 | #   "mlp l0 weight"
 62 | #   "mlp l0 bias"
 63 | #   "mlp l1 weight"
 64 | #   "mlp l1 bias"
 65 | # }
 66 | # {
 67 | #   "name": "final layer norm"
 68 | #   "weight"
 69 | #   "bias"
 70 | # }
 71 | # if present (i.e. for BERT):
 72 | # {
 73 | #   "name": "pooler"
 74 | #   "weight"
 75 | #   "bias"
 76 | # }
 77 | # {
 78 | #   "name": "lm head"
 79 | #   "dense weight"
 80 | #   "dense bias"
 81 | #   "layernorm weight"
 82 | #   "layernorm bias"
 83 | # }
 84 | # {
 85 | #   "name": "binary head"
 86 | #   "weight"
 87 | #   "bias"
 88 | # }
 89 | # - "done"
 90 | 
 91 | def load_plugin(plugin_type, name):
 92 |     module_name = f"{plugin_type}_{name}"
 93 |     try:
 94 |         plugin = importlib.import_module(module_name)
 95 |     except ModuleNotFoundError:
 96 |         module_name = name
 97 |         try:
 98 |             plugin = importlib.import_module(module_name)
 99 |         except ModuleNotFoundError:
100 |             sys.exit(f"Unable to load {plugin_type} plugin {name}. Exiting.")
101 | 
102 |     if not hasattr(plugin, 'add_arguments'):
103 |         sys.exit(f"{module_name} module is not a plugin. Exiting.")
104 | 
105 |     print(f"Loaded {module_name} as the {plugin_type}.")
106 |     return plugin
107 | 
108 | def main():
109 |     import argparse
110 |     parser = argparse.ArgumentParser(description="Megatron Checkpoint Utility Arguments",
111 |                                      allow_abbrev=False, conflict_handler='resolve')
112 | 
113 |     parser.add_argument('--model-type', type=str, required=True,
114 |                         choices=['GPT', 'BERT'],
115 |                         help='Type of the model')
116 |     parser.add_argument('--loader', type=str, default='megatron',
117 |                         help='Module name to load checkpoint, should be on python path')
118 |     parser.add_argument('--saver', type=str, default='megatron',
119 |                         help='Module name to save checkpoint, shdoul be on python path')
120 |     parser.add_argument('--load-dir', type=str, required=True,
121 |                         help='Directory to load model checkpoint from')
122 |     parser.add_argument('--save-dir', type=str, required=True,
123 |                         help='Directory to save model checkpoint to')
124 |     parser.add_argument('--max-queue-size', type=int, default=50,
125 |                         help='Maximum number of tensors in the queue')
126 |     parser.add_argument('--no-checking', action='store_false',
127 |                         help='Do not perform checking on the name and ordering of weights',
128 |                         dest='checking')
129 | 
130 |     known_args, _ = parser.parse_known_args()
131 |     loader = load_plugin('loader', known_args.loader)
132 |     saver = load_plugin('saver', known_args.saver)
133 | 
134 |     loader.add_arguments(parser)
135 |     saver.add_arguments(parser)
136 | 
137 |     args = parser.parse_args()
138 | 
139 |     queue = mp.Queue(maxsize=args.max_queue_size)
140 | 
141 |     print("Starting saver...")
142 |     saver_proc = mp.Process(target=saver.save_checkpoint, args=(queue, args))
143 |     saver_proc.start()
144 | 
145 |     print("Starting loader...")
146 |     loader.load_checkpoint(queue, args)
147 | 
148 |     print("Waiting for saver to complete...")
149 |     saver_proc.join()
150 | 
151 | 
152 | if __name__ == '__main__':
153 |     main()


--------------------------------------------------------------------------------