├── .coveragerc ├── .gitignore ├── .gitlab-ci.yml ├── LICENSE ├── README.md ├── docs ├── distrib_optimizer.md └── images │ └── distrib_optimizer │ ├── data_flow.png │ └── sharding_scheme.png ├── examples ├── detxoify_lm │ ├── README.md │ ├── annotations │ │ ├── filter-selfgeneration.py │ │ ├── perspective_api_annotate.py │ │ └── preprocess.sh │ ├── finetune_gpt.py │ ├── finetune_gpt_distributed-1.3b.sh │ ├── generate-1.3b.sh │ ├── generate_samples_gpt.py │ ├── perspective_api.py │ └── self_generation │ │ └── selfgenerate-1.3b-unconditional.sh ├── evaluate_retriever_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── finetune_retriever_distributed.sh ├── merge_mp_bert.sh ├── msdp │ ├── README.md │ ├── data_processing.sh │ ├── eval_knwl_generation.sh │ ├── eval_resp_generation.sh │ ├── prep_resp_gen.sh │ ├── prompt_knwl_gen.sh │ └── prompt_resp_gen.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_text_generation_server_345M.sh ├── run_text_generation_server_345M_8_tensor_parallel.sh └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── images ├── Achieved_petaFLOPs.png └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── __init__.py │ ├── parallel_state.py │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── fused_weight_gradient_dense.cpp │ ├── fused_weight_gradient_dense.cu │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_softmax.cpp │ ├── scaled_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── megablocks_utils.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ 
└── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── adafactor.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── p2p_communication.py ├── schedules.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── tensor_parallel │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py ├── test_basic.py ├── test_parallel_state.py ├── test_utilities.py └── test_utils.py └── tools ├── checkpoint_loader_megatron.py ├── checkpoint_saver_megatron.py ├── checkpoint_util.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_data_partitions.py ├── run_text_generation_server.py └── text_generation_cli.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: 
gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel 2 | 3 | test: 4 | tags: 5 | - docker_gpu_enabled 6 | script: 7 | - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/ 8 | coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' 9 | artifacts: 10 | paths: 11 | - coverage 12 | expire_in: 30 days 13 | -------------------------------------------------------------------------------- /docs/distrib_optimizer.md: -------------------------------------------------------------------------------- 1 | # Distributed Optimizer 2 | 3 | The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following: 4 | 5 | - [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed) 6 | - [no] distribute model gradients 7 | - [no] distribute model parameters 8 | 9 | Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size): 10 | 11 | | | Non-distributed optim | Distributed optim | 12 | | ------ | ------ | ------ | 13 | | float16 param, float16 grads | 20 | 4 + 16/d | 14 | | float16 param, fp32 grads | 18 | 6 + 12/d | 15 | | fp32 param, fp32 grads | 16 | 8 + 8/d | 16 | 17 | The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. The grad buffer at any given moment either holds: 18 | 19 | 1. all model grads 20 | 2. a 1/d size _copy_ of the main grads (before copying to the optimizer state) 21 | 3. a 1/d size _copy_ of the main params (after copying from the optimizer state) 22 | 4. all model params 23 | 5. zeros (or None), between iterations 24 | 25 | The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated. 26 | 27 | The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update: 28 | 29 | ## Data flow 30 | 31 | ![Data flow](images/distrib_optimizer/data_flow.png) 32 | 33 | ## Sharding scheme 34 | 35 | ![Sharding scheme](images/distrib_optimizer/sharding_scheme.png) 36 | 37 | ## Key steps 38 | 39 | _(note: using illustrations above, and assuming fp16 grads)_ 40 | 41 | - Backward pass finishes (grad buffer holds 16 fp16 grad elements) 42 | - Call reduce-scatter on each DP rank 43 | - Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage) 44 | - Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e. 
45 | - DP rank 0 copies elements [0:4] 46 | - DP rank 1 copies elements [4:8] 47 | - DP rank 2 copies elements [8:12] 48 | - DP rank 3 copies elements [12:16] 49 | - Optimizer.step() 50 | - Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer 51 | - Call all-gather on each DP rank 52 | - Grad buffer now contains all 16, fully updated, fp16 model param elements 53 | - Copy updated model params from grad buffer into their respective param tensors 54 | - (At this point, grad buffer is ready to be zero'd for the next iteration) 55 | -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/docs/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/docs/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/filter-selfgeneration.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from typing import Dict, Optional, List 4 | 5 | import joblib 6 | from googleapiclient import discovery 7 | from googleapiclient.errors import HttpError 8 | 9 | import argparse 10 | 11 | from tqdm import tqdm 12 | 13 | parser = argparse.ArgumentParser(description='Process some integers.') 14 | parser.add_argument('--data-path', type=str, default='', 15 | help='data path to load the jsonl') 16 | parser.add_argument('--out-path', type=str, default='', 17 | help='data path to load the jsonl') 18 | parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl', 19 | help='data path to load the prompt jsonl') 20 | parser.add_argument('--workers', type=int, default=10, 21 | help='Number of worker processes to launch') 22 | 23 | 24 | def get_corpus_scores(lines): 25 | scores = [] 26 | corpus = [] 27 | 28 | for line in tqdm(lines): 29 | data = json.loads(line) 30 | score = data['score'] 31 | if score is not None: 32 | scores.append(score['toxicity']) 33 | corpus.append(data) 34 | return scores, corpus 35 | 36 | 37 | def main(): 38 | args = parser.parse_args() 39 | 40 | with open(args.data_path) as f: 41 | lines = f.readlines() 42 | 43 | print(f"total line number: {len(lines)}") 44 | 45 | scores, corpus = get_corpus_scores(lines) 46 | import numpy as np 47 | scores = np.array(scores) 48 | indices = np.argsort(scores) 49 | 50 | print(f"total valid samples: {len(scores)}") 51 | 52 | print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}") 53 | print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}") 54 | print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}") 55 | print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}") 56 | print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}") 57 | 58 | samples_left = len(lines) // 2 59 | print(f"After filtering: {samples_left} of samples are left") 60 | 
nontoxic_indices = indices[:samples_left] 61 | print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}") 62 | print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}") 63 | nontoxic_data = [corpus[ind] for ind in nontoxic_indices] 64 | print(f"Total samples after filtering: {len(nontoxic_data)}") 65 | print(f"Examples: {nontoxic_data[:3]}") 66 | 67 | from sklearn.utils import shuffle 68 | nontoxic_data = shuffle(nontoxic_data) 69 | 70 | with open(args.out_path, 'w') as f: 71 | for x in nontoxic_data: 72 | f.write(json.dumps(x) + '\n') 73 | 74 | 75 | main() -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Change for multinode config 4 | GPUS_PER_NODE=16 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(($RANDOM + 1024)) 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | # input 12 | DATA_PATH=$1 13 | SHARE_DATA=$PWD # current work dir 14 | FINETUNED_PATH="$SHARE_DATA/$2" 15 | lr=$3 16 | bs=$4 17 | iter=$5 18 | CHECKPOINT_PATH=$6 19 | 20 | # vocab 21 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 22 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 23 | 24 | # tensorboard 25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" 26 | mkdir -p ${TENSORBOARD_DIR} 27 | 28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 29 | 30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \ 31 | examples/detxoify_lm/finetune_gpt.py \ 32 | --num-layers 24 \ 33 | --hidden-size 2048 \ 34 | --num-attention-heads 32 \ 35 | --micro-batch-size 4 \ 36 | --global-batch-size $bs \ 37 | --seq-length 2048 \ 38 | --max-position-embeddings 2048 \ 39 | --train-iters $iter \ 40 | --save $FINETUNED_PATH \ 41 | --load $CHECKPOINT_PATH \ 42 | --data-path $DATA_PATH \ 43 | --data-path2 ${DATA_BLEND} \ 44 | --vocab-file $VOCAB_FILE \ 45 | --merge-file $MERGE_FILE \ 46 | --data-impl mmap \ 47 | --split 100,0,0 \ 48 | --distributed-backend nccl \ 49 | --lr-decay-style constant \ 50 | --lr $lr \ 51 | --clip-grad 1.0 \ 52 | --weight-decay 0.1 \ 53 | --adam-beta1 0.9 \ 54 | --adam-beta2 0.95 \ 55 | --checkpoint-activations \ 56 | --log-interval 1 \ 57 | --save-interval 78 \ 58 | --eval-interval 78 \ 59 | --eval-iters 50 \ 60 | --fp16 \ 61 | --DDP-impl local \ 62 | --finetune --no-load-optim \ 63 | --log-validation-ppl-to-tensorboard \ 64 | --tensorboard-dir ${TENSORBOARD_DIR} 65 | -------------------------------------------------------------------------------- /examples/detxoify_lm/generate-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | VOCAB_FILE=gpt2-vocab.json 4 | MERGE_FILE=gpt2-merges.txt 5 |
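# Usage (inferred from the positional arguments used below):
#   bash examples/detxoify_lm/generate-1.3b.sh <prompts_file> <checkpoint_dir>
# $1 is the prompt file (also used for NUM_SAMPLES and to name the output file),
# $2 is the checkpoint directory passed to --load.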
6 | GPUS_PER_NODE=1 7 | # Change for multinode config 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(($RANDOM + 1024)) 10 | NNODES=1 11 | NODE_RANK=0 12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 13 | NUM_SAMPLES=$(wc -l < $1) 14 | PREFIX=$(basename $2) 15 | SEED=$(($RANDOM)) 16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl 17 | 18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 19 | 20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 21 | --tensor-model-parallel-size 1 \ 22 | --num-layers 24 \ 23 | --hidden-size 2048 \ 24 | --load $CHECKPOINT_PATH \ 25 | --num-attention-heads 32 \ 26 | --max-position-embeddings 2048 \ 27 | --tokenizer-type GPT2BPETokenizer \ 28 | --fp16 \ 29 | --micro-batch-size 400 \ 30 | --seq-length 2048 \ 31 | --out-seq-length 20 \ 32 | --temperature 1.0 \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --sample-input-file $1 \ 36 | --sample-output-file $OUTPUT \ 37 | --num-samples $NUM_SAMPLES \ 38 | --max-tokens-to-oom 1200000 \ 39 | --top_p 0.9 \ 40 | --seed $SEED 41 | 42 | -------------------------------------------------------------------------------- /examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | SHARE_DATA=$PWD # current work dir 4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 6 | 7 | GPUS_PER_NODE=1 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=$(($RANDOM + 1024)) 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | SEED=$3 15 | SUFFIX=$(basename $CHECKPOINT_PATH) 16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ 17 | mkdir -p $save_dir 18 | echo $save_dir/$SEED.out 19 | 20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 21 | 22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 23 | --tensor-model-parallel-size 1 \ 24 | --num-layers 24 \ 25 | --hidden-size 2048 \ 26 | --load $CHECKPOINT_PATH \ 27 | --num-attention-heads 32 \ 28 | --max-position-embeddings 2048 \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --fp16 \ 31 | --micro-batch-size 150 \ 32 | --seq-length 2048 \ 33 | --out-seq-length 1000 \ 34 | --temperature 1.0 \ 35 | --vocab-file $VOCAB_FILE \ 36 | --merge-file $MERGE_FILE \ 37 | --num-samples $1 \ 38 | --top_p 0.9 \ 39 | --max-tokens-to-oom 1200000 \ 40 | --genfile $save_dir/$SEED.out \ 41 | --seed $SEED 42 | 43 | -------------------------------------------------------------------------------- /examples/evaluate_retriever_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model or a finetuned model for Natural Question task 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task RETRIEVER-EVAL \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | 
--hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --activations-checkpoint-method uniform \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --faiss-use-gpu \ 33 | --retriever-report-topk-accuracies 1 5 20 100 \ 34 | --fp16 \ 35 | --indexer-log-interval 1000 \ 36 | --indexer-batch-size 128 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | 
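# Note: this script launches with torch.distributed.launch, as do the other
# fine-tuning examples here; newer scripts in this repo (and the CI config) use
# torchrun / python -m torch.distributed.run instead, which accepts the same
# --nproc_per_node/--nnodes/--node_rank/--master_addr/--master_port arguments.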
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --activations-checkpoint-method uniform \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/finetune_retriever_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finetune a BERT or pretrained ICT model using Google natural question data 4 | # Datasets can be downloaded from the following link: 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= 16 | 17 | # Load either of the below 18 | BERT_LOAD_PATH= 19 | PRETRAINED_CHECKPOINT= 20 | 21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 22 | --task RET-FINETUNE-NQ \ 23 | --train-with-neg \ 24 | --train-hard-neg 1 \ 25 | --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --tensor-model-parallel-size 1 \ 30 | --tokenizer-type BertWordPieceLowerCase \ 31 | --train-data nq-train.json \ 32 | --valid-data nq-dev.json \ 33 | --save ${CHECKPOINT_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --vocab-file bert-vocab.txt \ 36 | --bert-load ${BERT_LOAD_PATH} \ 37 | --save-interval 5000 \ 38 | --log-interval 10 \ 39 | --eval-interval 20000 \ 40 | --eval-iters 100 \ 41 | --indexer-log-interval 1000 \ 42 | --faiss-use-gpu \ 43 | --DDP-impl torch \ 44 | --fp16 \ 45 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 46 | --seq-length 512 \ 47 | --retriever-seq-length 256 \ 48 | --max-position-embeddings 512 \ 49 | --retriever-score-scaling \ 50 | --epochs 80 \ 51 | --micro-batch-size 8 \ 52 | --eval-micro-batch-size 16 \ 53 | --indexer-batch-size 128 \ 54 | --lr 2e-5 \ 55 | --lr-warmup-fraction 0.01 \ 56 | --weight-decay 1e-1 57 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | 
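# Merges the tensor-parallel partitions (e.g. mp_rank_00, mp_rank_01) under
# $CHECKPOINT_PATH into a single-rank checkpoint; WORLD_SIZE is set inline so the
# merge script sees the full tensor-parallel group without a distributed launcher.
# Note: tools/merge_mp_partitions.py is not present in this tree; the
# tools/checkpoint_util.py loader/saver utilities appear to cover the same
# tensor-parallel resizing use case.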
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp). 5 | 6 | -------------------------------------------------------------------------------- /examples/msdp/data_processing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Data preparation for our framework: preprocessing the WoW and WoI datasets 4 | # The datasets can be downloaded through the following links: 5 | # WoW: https://parl.ai/projects/wizard_of_wikipedia/ 6 | # WoI: https://parl.ai/projects/sea/ 7 | 8 | DIR=`pwd` 9 | # Before running the preprocessing, please download 10 | # the wizard of wikipedia and wizard datasets 11 | WOW_DATA_FOLDER= 12 | WOI_DATA_FOLDER= 13 | 14 | # We provide examples for processing the raw data from Wizard of Wikipedia 15 | # Processing the train dataset (train.json) 16 | python ${DIR}/tasks/msdp/preprocessing.py \ 17 | --func process_wow_dataset \ 18 | --raw_file ${WOW_DATA_FOLDER}/train.json \ 19 | --processed_file ${WOW_DATA_FOLDER}/train_processed.txt 20 | 21 | # Processing test seen dataset (test_random_split.json) 22 | python ${DIR}/tasks/msdp/preprocessing.py \ 23 | --func process_wow_dataset \ 24 | --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \ 25 | --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \ 26 | --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \ 27 | --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt 28 | 29 | # processing test unseen dataset (test_topic_split.json) 30 | python ${DIR}/tasks/msdp/preprocessing.py \ 31 | --func process_wow_dataset \ 32 | --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \ 33 | --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \ 34 | --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \ 35 | --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt 36 | 37 | 38 | # We provide the following script to process the raw data from Wizard of Internet 39 | # Processing the test dataset (test.jsonl) 40 | python ${DIR}/tasks/msdp/preprocessing.py \ 41 | --func process_woi_dataset \ 42 | --raw_file ${WOI_DATA_FOLDER}/test.jsonl \ 43 | --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \ 44 | --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \ 45 | --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt 46 | 47 | 48 | # Get the knowledge generation prompts for the each test dataset in WoW and WoI 49 | MODEL_FILE= 50 | # WoW test seen 51 | python ${DIR}/tasks/msdp/preprocessing.py \ 52 | --func get_knwl_gen_prompts \ 53 | --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \ 
54 | --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ 55 | --model_file ${MODEL_FILE} \ 56 | --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \ 57 | --data_type wow_seen 58 | 59 | # WoW test unseen 60 | python ${DIR}/tasks/msdp/preprocessing.py \ 61 | --func get_knwl_gen_prompts \ 62 | --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \ 63 | --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ 64 | --model_file ${MODEL_FILE} \ 65 | --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \ 66 | --data_type wow_unseen 67 | 68 | # WoI 69 | python ${DIR}/tasks/msdp/preprocessing.py \ 70 | --func get_knwl_gen_prompts \ 71 | --test_file ${WOI_DATA_FOLDER}/test_processed.txt \ 72 | --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ 73 | --model_file ${MODEL_FILE} \ 74 | --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \ 75 | --data_type woi 76 | 77 | 78 | # Get the response generation prompts (can be applied for all the test datasets) 79 | python ${DIR}/tasks/msdp/preprocessing.py \ 80 | --func get_resp_gen_prompts \ 81 | --train_file ${WOW_DATA_FOLDER}/train_processed.txt \ 82 | --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt 83 | 84 | -------------------------------------------------------------------------------- /examples/msdp/eval_knwl_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_knowledge_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_knowledge_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ############################################ 32 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 33 | ############################################ 34 | 35 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 37 | 38 | # To evaluate on these metrics, please setup the environments based on 39 | # the nlg-eval github, and run the corresponding evaluation commands. 40 | 41 | nlg-eval \ 42 | --hypothesis= \ 43 | --references= 44 | -------------------------------------------------------------------------------- /examples/msdp/eval_resp_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 
5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_response_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_response_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ########################## 32 | # Evaluate the KF1 scores. 33 | ########################## 34 | 35 | MODEL_GEN_PATH= \ 36 | (e.g., /testseen_response_generations.txt) 37 | GROUND_TRUTH_PATH= \ 38 | (e.g., /testseen_knowledge_reference.txt) 39 | 40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 2048 \ 45 | --max-position-embeddings 2048 \ 46 | --micro-batch-size 4 \ 47 | --task MSDP-EVAL-F1 \ 48 | --guess-file ${MODEL_GEN_PATH} \ 49 | --answer-file ${GROUND_TRUTH_PATH} 50 | 51 | 52 | ############################################ 53 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 54 | ############################################ 55 | 56 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 58 | 59 | # To evaluate on these metrics, please setup the environments based on 60 | # the nlg-eval github, and run the corresponding evaluation commands. 61 | 62 | nlg-eval \ 63 | --hypothesis= \ 64 | --references= 65 | -------------------------------------------------------------------------------- /examples/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/msdp/prompt_knwl_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge 5 | # The size of the pretrained language model is 357M 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= (e.g., /357m) 16 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 17 | MERGE_PATH= (e.g., /gpt2-merges.txt) 18 | INPUT_PATH= \ 19 | (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /testseen_knowledge_prompts.json) 22 | OUTPUT_PATH= \ 23 | (e.g., /testseen_knowledge_generations.txt) 24 | 25 | python -m 
torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type knowledge \ 42 | --num-prompt-examples 10 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/msdp/prompt_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1 5 | # The output is the corresponding response. 6 | # The size of the pretrained language model is 357M 7 | 8 | WORLD_SIZE=8 9 | 10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 11 | --nnodes 1 \ 12 | --node_rank 0 \ 13 | --master_addr localhost \ 14 | --master_port 6000" 15 | 16 | CHECKPOINT_PATH= (e.g., /357m) 17 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 18 | MERGE_PATH= (e.g., /gpt2-merges.txt) 19 | INPUT_PATH= (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /response_prompts.txt) 22 | OUTPUT_PATH= \ 23 | (e.g., /output_testseen_response_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type response \ 42 | --num-prompt-examples 20 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 
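# For reference, the full MSDP pipeline in this directory runs in this order:
#   1. data_processing.sh  - preprocess WoW/WoI and build the knowledge/response prompts
#   2. prompt_knwl_gen.sh  - stage 1: prompt the LM to generate context-relevant knowledge
#   3. prep_resp_gen.sh    - merge the generated knowledge back into the test file
#   4. prompt_resp_gen.sh  - stage 2: prompt the LM to generate the response (this script)
#   5. eval_knwl_generation.sh / eval_resp_generation.sh - F1/KF1 and nlg-eval metrics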
47 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path $DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_bert.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 32 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --train-iters 1000000 \ 26 | --save $CHECKPOINT_PATH \ 27 | --load $CHECKPOINT_PATH \ 28 | --data-path $DATA_PATH \ 29 | --vocab-file bert-vocab.txt \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.0001 \ 34 | --lr-decay-style linear \ 35 | --min-lr 1.0e-5 \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --lr-warmup-fraction .01 \ 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=_text_sentence 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_bert.py \ 19 | --tensor-model-parallel-size 2 \ 20 | --pipeline-model-parallel-size 2 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --micro-batch-size 2 \ 25 | --global-batch-size 16 \ 26 | --seq-length 512 \ 27 | --max-position-embeddings 512 \ 28 | --train-iters 1000000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load 
$CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file $VOCAB_FILE \ 33 | --data-impl mmap \ 34 | --split 949,50,1 \ 35 | --distributed-backend nccl \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --min-lr 1.0e-5 \ 39 | --lr-decay-iters 990000 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --activations-checkpoint-method uniform \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 \ 52 | --activations-checkpoint-method uniform " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- 
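As a rough sanity check on the 175B configuration above, the short bash sketch below (an editor's illustration, not a repository file) recomputes the approximate parameter count and the implied data-parallel size from the same settings; the padded vocabulary size of 51,200 is an assumption.

# Approximate parameter count: ~12*L*h^2 for the transformer blocks plus V*h for the embedding.
NLAYERS=96; HIDDEN=12288; VOCAB=51200
echo $(( 12 * NLAYERS * HIDDEN * HIDDEN + VOCAB * HIDDEN ))   # ~174.6e9 parameters
# Parallelism implied by the settings: 128 nodes x 8 GPUs, tensor-parallel 8, pipeline-parallel 16.
TP=8; PP=16; GPUS=$(( 128 * 8 ))
echo "data-parallel size: $(( GPUS / (TP * PP) ))"            # 1024 / (8*16) = 8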
/examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --micro-batch-size 8 \ 24 | --global-batch-size 64 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --activations-checkpoint-method uniform \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --tensor-model-parallel-size 2 \ 21 | --pipeline-model-parallel-size 2 \ 22 | --sequence-parallel \ 23 | --num-layers 24 \ 24 | --hidden-size 1024 \ 25 | --num-attention-heads 16 \ 26 | --micro-batch-size 4 \ 27 | --global-batch-size 16 \ 28 | --seq-length 1024 \ 29 | --max-position-embeddings 1024 \ 30 | --train-iters 500000 \ 31 | --lr-decay-iters 320000 \ 32 | --save $CHECKPOINT_PATH \ 33 | --load $CHECKPOINT_PATH \ 34 | --data-path $DATA_PATH \ 35 | --vocab-file gpt2-vocab.json \ 36 | --merge-file gpt2-merges.txt \ 37 | --data-impl mmap \ 38 | --split 949,50,1 \ 39 | --distributed-backend nccl \ 40 | --lr 0.00015 \ 41 | --lr-decay-style cosine \ 42 | --min-lr 1.0e-5 \ 43 | --weight-decay 1e-2 \ 44 | --clip-grad 1.0 \ 45 | --lr-warmup-fraction .01 \ 46 | --activations-checkpoint-method uniform \ 47 | --log-interval 100 \ 48 | --save-interval 10000 \ 49 | --eval-interval 1000 \ 50 | --eval-iters 10 \ 51 | --fp16 52 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 16 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 \ 39 | --vocab-extra-ids 100 40 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_t5.py \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 128 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 
949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 \ 48 | --vocab-extra-ids 100 49 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_t5.py \ 18 | --tensor-model-parallel-size 2 \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 128 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file t5-vocab.txt \ 35 | --data-impl mmap \ 36 | --split 949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 \ 48 | --vocab-extra-ids 100 49 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 
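# Once the server is running it accepts REST requests on port 5000 (see
# megatron/text_generation_server.py and tools/text_generation_cli.py).
# A minimal query, assuming the /api endpoint and JSON fields used there:
#   curl 'http://localhost:5000/api' -X PUT \
#        -H 'Content-Type: application/json; charset=UTF-8' \
#        -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'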
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.run $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 1 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 
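# The run_figure_*.sh and run_table_1.sh scripts are expected to set TP, PP, MBS,
# GBS, NLS, HS, NAH, DDP, NNODES, JOB_NAME, and (optionally) MEGATRON_EXTRA_PARAMS
# before sourcing this file; see run_figure_11.sh for an example.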
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Setup 13 | 14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 15 | update the unspecified values (in angle brackets `<...>`) before launching any 16 | scripts. 17 | 18 | 19 | 20 | ## Scripts 21 | 22 | Below is a list of scripts that can be used to reproduce various figures in our 23 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 24 | 25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 26 | for GPT models ranging from 1 billion to 1 trillion parameters. 27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 28 | performance of pipeline parallelism. 29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 30 | the interleaved schedule on a 175B GPT model. 31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 32 | different degrees of pipeline and tensor model parallelism on a model with 33 | 162.2 billion parameters. 34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 35 | different degrees of data and pipeline model parallelism on a model with 36 | 5.9 billion parameters. 37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 38 | different degrees of data and tensor model parallelism on a model with 39 | 5.9 billion parameters. 40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 41 | microbatch size. 42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 43 | activation recomputation. 44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 45 | the scatter-gather communication optimization. 
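
## Launching

Each `run_*.sh` script sources `CONFIG.sh` (cluster and model settings) and then
`SBATCH.sh`, which submits the job through `sbatch`. Once the values in
`CONFIG.sh` are filled in, a typical launch looks roughly like this (the options
to edit at the top of the script depend on the figure being reproduced):

```bash
cd examples/sc21
# e.g. pick PP and GBS at the top of run_figure_11.sh, then:
bash run_figure_11.sh
```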
46 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_table_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | # model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T] 7 | MODEL_SIZE=1.7B 8 | 9 | 10 | 11 | 12 | 13 | 14 | if [ ${MODEL_SIZE} == "1.7B" ]; then 15 | TP=1 16 | PP=1 17 | MBS=16 18 | GBS=512 19 | NLS=24 20 | HS=2304 21 | NAH=24 22 | DDP=torch 23 | NNODES=4 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | elif [ ${MODEL_SIZE} == "3.6B" ]; then 26 | TP=2 27 | PP=1 28 | MBS=16 29 | GBS=512 30 | NLS=30 31 | HS=3072 32 | NAH=32 33 | DDP=torch 34 | NNODES=8 35 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 36 | elif [ ${MODEL_SIZE} == "7.5B" ]; then 37 | TP=4 38 | PP=1 39 | MBS=16 40 | GBS=512 41 | NLS=36 42 | HS=4096 43 | NAH=32 44 | DDP=torch 45 | NNODES=16 46 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 47 | elif [ ${MODEL_SIZE} == "18B" ]; then 48 | TP=8 49 | PP=1 50 | MBS=8 51 | GBS=1024 52 | NLS=40 53 | HS=6144 54 | NAH=48 55 | DDP=torch 56 | NNODES=32 57 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 58 | elif [ ${MODEL_SIZE} == "39B" ]; then 59 | TP=8 60 | PP=2 61 | MBS=4 62 | GBS=1536 63 | NLS=48 64 | HS=8192 65 | NAH=64 66 | DDP=local 67 | NNODES=64 68 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 69 | elif [ ${MODEL_SIZE} == "76B" ]; then 70 | TP=8 71 | PP=4 72 | MBS=2 73 | GBS=1792 74 | NLS=60 75 | HS=10240 76 | NAH=80 77 | DDP=local 78 | NNODES=128 79 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5" 80 | elif [ ${MODEL_SIZE} == "145B" ]; then 81 | TP=8 82 | PP=8 83 | MBS=2 84 | GBS=2304 85 | NLS=80 86 | HS=12288 87 | NAH=96 88 | DDP=local 89 | NNODES=192 90 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 " 91 | elif [ ${MODEL_SIZE} == "310B" ]; then 92 | TP=8 93 | PP=16 94 | MBS=1 95 | GBS=2160 96 | NLS=96 97 | HS=16384 98 | NAH=128 99 | DDP=local 100 | NNODES=240 101 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 " 102 | elif [ ${MODEL_SIZE} == "530B" ]; then 103 | TP=8 104 | PP=35 105 | MBS=1 106 | GBS=2520 107 | NLS=105 108 | HS=20480 109 | NAH=128 110 | DDP=local 111 | NNODES=315 112 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 " 113 | elif [ ${MODEL_SIZE} == "1T" ]; then 114 | TP=8 115 | PP=64 116 | MBS=1 117 | GBS=3072 118 | NLS=128 119 | HS=25600 120 | NAH=160 121 | DDP=local 122 | NNODES=384 123 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 124 | else 125 | echo "Invalid configuration" 126 | exit 1 127 | fi 128 | 129 | 130 | # Name of the job 131 | export JOB_NAME=results_table_1_model_size_${MODEL_SIZE} 132 | 133 | 134 | # Import the configs. 135 | . `pwd`/CONFIG.sh 136 | 137 | 138 | # Submit the job. 139 | . 
`pwd`/SBATCH.sh 140 | 141 | 142 | exit 0 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /images/Achieved_petaFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/images/Achieved_petaFLOPs.png -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | from .global_vars import get_args 5 | from .global_vars import get_current_global_batch_size 6 | from .global_vars import get_num_microbatches 7 | from .global_vars import get_signal_handler 8 | from .global_vars import update_num_microbatches 9 | from .global_vars import get_tokenizer 10 | from .global_vars import get_tensorboard_writer 11 | from .global_vars import get_adlr_autoresume 12 | from .global_vars import get_timers 13 | from .initialize import initialize_megatron 14 | 15 | from .utils import (print_rank_0, 16 | is_last_rank, 17 | print_rank_last) 18 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | # Alias parallel_state as mpu, its legacy name 6 | mpu = parallel_state 7 | 8 | __all__ = [ 9 | "parallel_state", 10 | "tensor_parallel", 11 | "utils", 12 | ] 13 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | 4 | from .layers import ( 5 | ColumnParallelLinear, 6 | RowParallelLinear, 7 | VocabParallelEmbedding, 8 | set_tensor_model_parallel_attributes, 9 | set_defaults_if_not_set_tensor_model_parallel_attributes, 10 | copy_tensor_model_parallel_attributes, 11 | param_is_not_tensor_parallel_duplicate, 12 | linear_with_grad_accumulation_and_async_allreduce 13 | 14 | ) 15 | 16 | from .mappings import ( 17 | copy_to_tensor_model_parallel_region, 18 | gather_from_tensor_model_parallel_region, 19 | gather_from_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | scatter_to_sequence_parallel_region, 22 | ) 23 | 24 | from .random import ( 25 | checkpoint, 26 | get_cuda_rng_tracker, 27 | model_parallel_cuda_manual_seed, 28 | ) 29 | 30 | from .utils import ( 31 | split_tensor_along_last_dim, 32 | split_tensor_into_1d_equal_chunks, 33 | gather_split_1d_tensor, 34 | ) 35 | 36 | __all__ = [ 37 | # cross_entropy.py 38 | "vocab_parallel_cross_entropy", 39 | # data.py 40 | "broadcast_data", 41 | #layers.py 42 | "ColumnParallelLinear", 43 | "RowParallelLinear", 44 | "VocabParallelEmbedding", 45 | "set_tensor_model_parallel_attributes", 46 | 
"set_defaults_if_not_set_tensor_model_parallel_attributes", 47 | "copy_tensor_model_parallel_attributes", 48 | "param_is_not_tensor_parallel_duplicate", 49 | "linear_with_grad_accumulation_and_async_allreduce", 50 | # mappings.py 51 | "copy_to_tensor_model_parallel_region", 52 | "gather_from_tensor_model_parallel_region", 53 | "gather_from_sequence_parallel_region", 54 | # "reduce_from_tensor_model_parallel_region", 55 | "scatter_to_tensor_model_parallel_region", 56 | "scatter_to_sequence_parallel_region", 57 | # random.py 58 | "checkpoint", 59 | "get_cuda_rng_tracker", 60 | "model_parallel_cuda_manual_seed", 61 | # utils.py 62 | "split_tensor_along_last_dim", 63 | "split_tensor_into_1d_equal_chunks", 64 | "gather_split_1d_tensor", 65 | ] 66 | -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.parallel_state import ( 6 | get_tensor_model_parallel_group, 7 | get_tensor_model_parallel_rank, 8 | get_tensor_model_parallel_src_rank, 9 | ) 10 | 11 | 12 | _MAX_DATA_DIM = 5 13 | 14 | 15 | def _check_data_types(keys, data, target_dtype): 16 | """Check that all the keys have the same target data type.""" 17 | for key in keys: 18 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 19 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 20 | 21 | 22 | def _build_key_size_numel_dictionaries(keys, data): 23 | """Build the size on rank 0 and broadcast.""" 24 | max_dim = _MAX_DATA_DIM 25 | sizes = [0 for _ in range(max_dim) for _ in keys] 26 | 27 | # Pack the sizes on rank zero. 28 | if get_tensor_model_parallel_rank() == 0: 29 | offset = 0 30 | for key in keys: 31 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 32 | size = data[key].size() 33 | for i, s in enumerate(size): 34 | sizes[i + offset] = s 35 | offset += max_dim 36 | 37 | # Move to GPU and broadcast. 38 | sizes_cuda = torch.cuda.LongTensor(sizes) 39 | torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), 40 | group=get_tensor_model_parallel_group()) 41 | 42 | # Move back to cpu and unpack. 43 | sizes_cpu = sizes_cuda.cpu() 44 | key_size = {} 45 | key_numel = {} 46 | total_numel = 0 47 | offset = 0 48 | for key in keys: 49 | i = 0 50 | size = [] 51 | numel = 1 52 | while sizes_cpu[offset + i] > 0: 53 | this_size = sizes_cpu[offset + i] 54 | size.append(this_size) 55 | numel *= this_size 56 | i += 1 57 | key_size[key] = size 58 | key_numel[key] = numel 59 | total_numel += numel 60 | offset += max_dim 61 | 62 | return key_size, key_numel, total_numel 63 | 64 | 65 | def broadcast_data(keys, data, datatype): 66 | """Broadcast data from rank zero of each model parallel group to the 67 | members of the same model parallel group. 68 | 69 | Arguments: 70 | keys: list of keys in the data disctionary to be broadcasted 71 | data: data dictionary of string keys and cpu tensor values. 72 | datatype: torch data type of all tensors in data associated 73 | with keys. 74 | """ 75 | # Build (key, size) and (key, number of elements) dictionaries along 76 | # with the total number of elements on all ranks. 77 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 78 | data) 79 | 80 | # Pack on rank zero. 81 | if get_tensor_model_parallel_rank() == 0: 82 | # Check that all keys have the same data type. 
83 | _check_data_types(keys, data, datatype) 84 | # Flatten the data associated with the keys 85 | flatten_data = torch.cat( 86 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 87 | else: 88 | flatten_data = torch.empty(total_numel, 89 | device=torch.cuda.current_device(), 90 | dtype=datatype) 91 | 92 | # Broadcast 93 | torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(), 94 | group=get_tensor_model_parallel_group()) 95 | 96 | # Unpack 97 | output = {} 98 | offset = 0 99 | for key in keys: 100 | size = key_size[key] 101 | numel = key_numel[key] 102 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 103 | offset += numel 104 | 105 | return output 106 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Blendable dataset.""" 4 | 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from megatron import print_rank_0 11 | 12 | class BlendableDataset(torch.utils.data.Dataset): 13 | 14 | 15 | def __init__(self, datasets, weights): 16 | 17 | self.datasets = datasets 18 | num_datasets = len(datasets) 19 | assert num_datasets == len(weights) 20 | 21 | self.size = 0 22 | for dataset in self.datasets: 23 | self.size += len(dataset) 24 | 25 | # Normalize weights. 26 | weights = np.array(weights, dtype=np.float64) 27 | sum_weights = np.sum(weights) 28 | assert sum_weights > 0.0 29 | weights /= sum_weights 30 | 31 | # Build indecies. 
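# (The helper call below fills `dataset_index` / `dataset_sample_index` so that
# global sample i is drawn from dataset dataset_index[i] at position
# dataset_sample_index[i], with datasets sampled in proportion to the
# normalized weights; __getitem__ simply dereferences these two arrays.)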
32 | start_time = time.time() 33 | assert num_datasets < 255 34 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 35 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 36 | 37 | from megatron.data import helpers 38 | helpers.build_blending_indices(self.dataset_index, 39 | self.dataset_sample_index, 40 | weights, num_datasets, self.size, 41 | torch.distributed.get_rank() == 0) 42 | print_rank_0('> elapsed time for building blendable dataset indices: ' 43 | '{:.2f} (sec)'.format(time.time() - start_time)) 44 | 45 | 46 | def __len__(self): 47 | return self.size 48 | 49 | 50 | def __getitem__(self, idx): 51 | dataset_idx = self.dataset_index[idx] 52 | sample_idx = self.dataset_sample_index[idx] 53 | return self.datasets[dataset_idx][sample_idx] 54 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | 
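A minimal usage sketch for DistributedSignalHandler (the training-loop names below
are illustrative and not part of this module):

    from megatron.dist_signal_handler import DistributedSignalHandler

    def train(num_iterations, run_one_iteration):
        with DistributedSignalHandler() as handler:
            for iteration in range(num_iterations):
                run_one_iteration(iteration)
                # signals_received() all-gathers the per-rank flag, so every rank
                # sees a SIGTERM delivered to any rank and can stop together.
                if any(handler.signals_received()):
                    print(f'Exit signal received, stopping at iteration {iteration}')
                    break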
-------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/fused_weight_gradient_dense.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "type_shim.h" 8 | 9 | 10 | template 11 | int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim); 12 | 13 | void wgrad_gemm_accum_fp32(const at::Tensor input, const at::Tensor d_output, at::Tensor d_weight) { 14 | at::Tensor input_2d, d_output_2d; 15 | // input tensor: collapse to the first dim 16 | auto in_sizes = input.sizes(); 17 | if (input.dim() > 2) { 18 | input_2d = input.view({-1, in_sizes[in_sizes.size() - 1]}); 19 | } else { 20 | input_2d = input; 21 | } 22 | // d_output tensor: collapse to the first dim 23 | auto d_out_sizes = d_output.sizes(); 24 | if (d_output.dim() > 2) { 25 | d_output_2d = d_output.view({-1, d_out_sizes[d_out_sizes.size() - 1]}); 26 | } else { 27 | d_output_2d = d_output; 28 | } 29 | 30 | int hidden_dim = input_2d.size(0); 31 | int in_dim = input_2d.size(1); 32 | int out_dim = d_weight.size(0); 33 | 34 | DISPATCH_HALF_BFLOAT_AND_FLOAT(input_2d.scalar_type(), "wgrad_gemm_accum_fp32", 35 | int result = wgrad_gemm_accum_fp32_cuda( 36 | input_2d.data_ptr(), 37 | d_output_2d.data_ptr(), 38 | d_weight.data_ptr(), 39 | in_dim, 40 | hidden_dim, 41 | out_dim); 42 | ); 43 | } 44 | 45 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 46 | m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32, "wgrad gemm accum in fp32"); 47 | } 48 | -------------------------------------------------------------------------------- /megatron/fused_kernels/fused_weight_gradient_dense.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* Includes, cuda */ 10 | #include 11 | 
#include 12 | 13 | 14 | // BF16 Tensor core wrapper around cublas GEMMEx 15 | cublasStatus_t gemmex_wrapper( 16 | cublasHandle_t handle, 17 | cublasOperation_t transa, 18 | cublasOperation_t transb, 19 | int m, 20 | int n, 21 | int k, 22 | const float* alpha, 23 | at::BFloat16* A, 24 | int lda, 25 | at::BFloat16* B, 26 | int ldb, 27 | const float* beta, 28 | float* C, 29 | int ldc) { 30 | return cublasGemmEx( 31 | handle, 32 | transa, 33 | transb, 34 | m, 35 | n, 36 | k, 37 | alpha, 38 | A, 39 | CUDA_R_16BF, 40 | lda, 41 | B, 42 | CUDA_R_16BF, 43 | ldb, 44 | beta, 45 | C, 46 | CUDA_R_32F, 47 | ldc, 48 | CUDA_R_32F, 49 | CUBLAS_GEMM_DEFAULT_TENSOR_OP); 50 | } 51 | 52 | // FP16 Tensor core wrapper around cublas GEMMEx 53 | cublasStatus_t gemmex_wrapper( 54 | cublasHandle_t handle, 55 | cublasOperation_t transa, 56 | cublasOperation_t transb, 57 | int m, 58 | int n, 59 | int k, 60 | const float* alpha, 61 | at::Half* A, 62 | int lda, 63 | at::Half* B, 64 | int ldb, 65 | const float* beta, 66 | float* C, 67 | int ldc) { 68 | return cublasGemmEx( 69 | handle, 70 | transa, 71 | transb, 72 | m, 73 | n, 74 | k, 75 | alpha, 76 | A, 77 | CUDA_R_16F, 78 | lda, 79 | B, 80 | CUDA_R_16F, 81 | ldb, 82 | beta, 83 | C, 84 | CUDA_R_32F, 85 | ldc, 86 | CUDA_R_32F, 87 | CUBLAS_GEMM_DEFAULT_TENSOR_OP); 88 | } 89 | 90 | // FP32 Tensor core wrapper around cublas GEMMEx 91 | cublasStatus_t gemmex_wrapper( 92 | cublasHandle_t handle, 93 | cublasOperation_t transa, 94 | cublasOperation_t transb, 95 | int m, 96 | int n, 97 | int k, 98 | const float* alpha, 99 | float* A, 100 | int lda, 101 | float* B, 102 | int ldb, 103 | const float* beta, 104 | float* C, 105 | int ldc) { 106 | return cublasGemmEx( 107 | handle, 108 | transa, 109 | transb, 110 | m, 111 | n, 112 | k, 113 | alpha, 114 | A, 115 | CUDA_R_32F, 116 | lda, 117 | B, 118 | CUDA_R_32F, 119 | ldb, 120 | beta, 121 | C, 122 | CUDA_R_32F, 123 | ldc, 124 | CUDA_R_32F, 125 | CUBLAS_GEMM_DEFAULT_TENSOR_OP); 126 | } 127 | 128 | template 129 | int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim) { 130 | cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); 131 | cudaStream_t stream; 132 | cublasGetStream(handle, &stream); 133 | const float alpha = 1.0; 134 | const float beta = 1.0; 135 | int status = 1; 136 | 137 | status = gemmex_wrapper( 138 | handle, 139 | CUBLAS_OP_N, 140 | CUBLAS_OP_T, 141 | in_dim, 142 | out_dim, 143 | hidden_dim, 144 | &alpha, 145 | input, 146 | in_dim, 147 | d_output, 148 | out_dim, 149 | &beta, 150 | d_weight, 151 | in_dim); 152 | return status; 153 | } 154 | 155 | template int wgrad_gemm_accum_fp32_cuda(at::Half *input, at::Half *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim); 156 | template int wgrad_gemm_accum_fp32_cuda(at::BFloat16 *input, at::BFloat16 *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim); 157 | template int wgrad_gemm_accum_fp32_cuda(float *input, float *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim); 158 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
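   C++ binding layer for the scaled, masked softmax: the shape/dtype checks and the
   pybind11 module live here, while the kernels themselves are launched from
   scaled_masked_softmax_cuda.cu.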
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | torch::Tensor const& mask, 14 | float scale_factor); 15 | 16 | torch::Tensor bwd_cuda( 17 | torch::Tensor const& output_grads, 18 | torch::Tensor const& softmax_results, 19 | float scale_factor); 20 | 21 | int get_batch_per_block_cuda( 22 | int query_seq_len, 23 | int key_seq_len, 24 | int batches, 25 | int attn_heads); 26 | 27 | torch::Tensor fwd( 28 | torch::Tensor const& input, 29 | torch::Tensor const& mask, 30 | float scale_factor) { 31 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 32 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 33 | (input.scalar_type() == at::ScalarType::BFloat16), 34 | "Only fp16 and bf16 are supported"); 35 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 36 | 37 | return fwd_cuda(input, mask, scale_factor); 38 | } 39 | 40 | torch::Tensor bwd( 41 | torch::Tensor const& output_grads, 42 | torch::Tensor const& softmax_results, 43 | float scale_factor) { 44 | 45 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 46 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 47 | 48 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 49 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 50 | "Only fp16 and bf16 are supported"); 51 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 52 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | int get_batch_per_block( 59 | int query_seq_len, 60 | int key_seq_len, 61 | int batches, 62 | int attn_heads) { 63 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 64 | } 65 | 66 | } // end namespace scaled_masked_softmax 67 | } // end namespace fused_softmax 68 | } // end namespace multihead_attn 69 | 70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 71 | m.def("forward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 74 | 75 | m.def("backward", 76 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 77 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 78 | 79 | m.def("get_batch_per_block", 80 | &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, 81 | "Return Batch per block size." 82 | ); 83 | } 84 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
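   CUDA-side entry points for the scaled, masked softmax: fwd_cuda/bwd_cuda allocate
   the result tensors and dispatch on fp16/bf16 to the templated kernels declared in
   scaled_masked_softmax.h.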
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "scaled_masked_softmax.h" 11 | #include "type_shim.h" 12 | 13 | namespace multihead_attn { 14 | namespace fused_softmax { 15 | namespace scaled_masked_softmax { 16 | 17 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 18 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 19 | } 20 | 21 | 22 | torch::Tensor fwd_cuda( 23 | torch::Tensor const& input, 24 | torch::Tensor const& mask, 25 | float scale_factor) 26 | { 27 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 28 | const int batches = input.size(0); 29 | const int pad_batches = mask.size(0); 30 | const int attn_heads = input.size(1); 31 | const int query_seq_len = input.size(2); 32 | const int key_seq_len = input.size(3); 33 | TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); 34 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 35 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 36 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 37 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 38 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast(input.data_ptr()); 47 | void* mask_ptr = static_cast(mask.data_ptr()); 48 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 49 | 50 | DISPATCH_HALF_AND_BFLOAT( 51 | input.scalar_type(), 52 | "dispatch_scaled_masked_softmax_forward", 53 | dispatch_scaled_masked_softmax_forward( 54 | reinterpret_cast(softmax_results_ptr), 55 | reinterpret_cast(input_ptr), 56 | reinterpret_cast(mask_ptr), 57 | scale_factor, 58 | query_seq_len, 59 | key_seq_len, 60 | batches, 61 | attn_heads, 62 | pad_batches); 63 | ); 64 | return softmax_results; 65 | } 66 | 67 | torch::Tensor bwd_cuda( 68 | torch::Tensor const& output_grads_, 69 | torch::Tensor const& softmax_results_, 70 | float scale_factor) { 71 | 72 | auto output_grads = output_grads_.contiguous(); 73 | auto softmax_results = softmax_results_.contiguous(); 74 | 75 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 76 | const int batches = output_grads.size(0); 77 | const int attn_heads = output_grads.size(1); 78 | const int query_seq_len = output_grads.size(2); 79 | const int key_seq_len = output_grads.size(3); 80 | 81 | auto act_options = output_grads.options().requires_grad(false); 82 | torch::Tensor input_grads = 83 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 84 | 85 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 86 | void* input_grads_ptr = static_cast(input_grads.data_ptr()); 87 | 88 | //Softmax Grad 89 | DISPATCH_HALF_AND_BFLOAT( 90 | output_grads_.scalar_type(), 91 | "dispatch_scaled_masked_softmax_backward", 92 | dispatch_scaled_masked_softmax_backward( 93 | reinterpret_cast(input_grads_ptr), 94 | reinterpret_cast(output_grads_ptr), 95 | reinterpret_cast(softmax_results.data_ptr()), 96 | scale_factor, 97 | query_seq_len, 98 | key_seq_len, 99 | batches, 100 | attn_heads); 101 | ); 102 | 103 | return input_grads; 104 | } 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- 
/megatron/fused_kernels/scaled_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd( 21 | torch::Tensor const& input, 22 | float scale_factor) { 23 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 24 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 25 | (input.scalar_type() == at::ScalarType::BFloat16), 26 | "Only fp16 and bf16 are supported"); 27 | 28 | return fwd_cuda(input, scale_factor); 29 | } 30 | 31 | torch::Tensor bwd( 32 | torch::Tensor const& output_grads, 33 | torch::Tensor const& softmax_results, 34 | float scale_factor) { 35 | 36 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 37 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 38 | 39 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 40 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 41 | "Only fp16 and bf16 are supported"); 42 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 43 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 44 | "Only fp16 and bf16 are supported"); 45 | 46 | return bwd_cuda(output_grads, softmax_results, scale_factor); 47 | } 48 | 49 | } // end namespace scaled_softmax 50 | } // end namespace fused_softmax 51 | } // end namespace multihead_attn 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("forward", 55 | &multihead_attn::fused_softmax::scaled_softmax::fwd, 56 | "Self Multihead Attention scaled, softmax -- Forward."); 57 | m.def("backward", 58 | &multihead_attn::fused_softmax::scaled_softmax::bwd, 59 | "Self Multihead Attention scaled, softmax -- Backward."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "scaled_masked_softmax.h" 11 | #include "type_shim.h" 12 | 13 | namespace multihead_attn { 14 | namespace fused_softmax { 15 | namespace scaled_softmax { 16 | 17 | torch::Tensor fwd_cuda( 18 | torch::Tensor const& input, 19 | float scale_factor) 20 | { 21 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 22 | const int batches = input.size(0); 23 | const int attn_heads = input.size(1); 24 | const int query_seq_len = input.size(2); 25 | const int key_seq_len = input.size(3); 26 | TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); 27 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 28 | 29 | // Output 30 | auto act_options = input.options().requires_grad(false); 31 | torch::Tensor softmax_results = 32 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 33 | 34 | // Softmax Intermediate Result Ptr 35 | void* input_ptr = static_cast(input.data_ptr()); 36 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 37 | 38 | DISPATCH_HALF_AND_BFLOAT( 39 | input.scalar_type(), 40 | "dispatch_scaled_softmax_forward", 41 | dispatch_scaled_softmax_forward( 42 | reinterpret_cast(softmax_results_ptr), 43 | reinterpret_cast(input_ptr), 44 | scale_factor, 45 | query_seq_len, 46 | key_seq_len, 47 | batches, 48 | attn_heads); 49 | ); 50 | return softmax_results; 51 | } 52 | 53 | torch::Tensor bwd_cuda( 54 | torch::Tensor const& output_grads_, 55 | torch::Tensor const& softmax_results_, 56 | float scale_factor) { 57 | 58 | auto output_grads = output_grads_.contiguous(); 59 | auto softmax_results = softmax_results_.contiguous(); 60 | 61 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 62 | const int batches = output_grads.size(0); 63 | const int attn_heads = output_grads.size(1); 64 | const int query_seq_len = output_grads.size(2); 65 | const int key_seq_len = output_grads.size(3); 66 | 67 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 68 | 69 | //Softmax Grad 70 | DISPATCH_HALF_AND_BFLOAT( 71 | output_grads_.scalar_type(), 72 | "dispatch_scaled_masked_softmax_backward", 73 | dispatch_scaled_masked_softmax_backward( 74 | reinterpret_cast(output_grads_ptr), 75 | reinterpret_cast(output_grads_ptr), 76 | reinterpret_cast(softmax_results.data_ptr()), 77 | scale_factor, 78 | query_seq_len, 79 | key_seq_len, 80 | batches, 81 | attn_heads); 82 | ); 83 | 84 | //backward pass is completely in-place 85 | return output_grads; 86 | } 87 | } 88 | } 89 | } 90 | 91 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
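   Binding layer for the upper-triangular (causal) masked softmax used in
   self-attention; unlike the general masked variant it takes 3D inputs of shape
   [attn_batches, seq_len, seq_len] and needs no explicit mask tensor.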
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace multihead_attn { 8 | namespace fused_softmax { 9 | namespace scaled_upper_triang_masked_softmax { 10 | 11 | torch::Tensor fwd_cuda( 12 | torch::Tensor const& input, 13 | float scale_factor); 14 | 15 | torch::Tensor bwd_cuda( 16 | torch::Tensor const& output_grads, 17 | torch::Tensor const& softmax_results, 18 | float scale_factor); 19 | 20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 21 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 22 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 23 | (input.scalar_type() == at::ScalarType::BFloat16), 24 | "Only fp16 and bf16 are supported"); 25 | 26 | return fwd_cuda(input, scale_factor); 27 | } 28 | 29 | torch::Tensor bwd( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor) { 33 | 34 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 35 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 36 | 37 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 38 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 39 | "Only fp16 and bf16 are supported"); 40 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 41 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 42 | "Only fp16 and bf16 are supported"); 43 | 44 | return bwd_cuda(output_grads, softmax_results, scale_factor); 45 | } 46 | 47 | } // end namespace scaled_upper_triang_masked_softmax 48 | } // end namespace fused_softmax 49 | } // end namespace multihead_attn 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", 53 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 54 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 55 | m.def("backward", 56 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 57 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 58 | } 59 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "scaled_upper_triang_masked_softmax.h" 11 | #include "type_shim.h" 12 | 13 | namespace multihead_attn { 14 | namespace fused_softmax { 15 | namespace scaled_upper_triang_masked_softmax { 16 | 17 | torch::Tensor fwd_cuda( 18 | torch::Tensor const& input, 19 | float scale_factor) 20 | { 21 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 22 | const int attn_batches = input.size(0); 23 | const int seq_len = input.size(1); 24 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 25 | 26 | // Output 27 | auto act_options = input.options().requires_grad(false); 28 | torch::Tensor softmax_results = 29 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 30 | 31 | // Softmax Intermediate Result Ptr 32 | void* input_ptr = static_cast(input.data_ptr()); 33 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 34 | 35 | DISPATCH_HALF_AND_BFLOAT( 36 | input.scalar_type(), 37 | "dispatch_scaled_upper_triang_masked_softmax_forward", 38 | dispatch_scaled_upper_triang_masked_softmax_forward( 39 | reinterpret_cast(softmax_results_ptr), 40 | reinterpret_cast(input_ptr), 41 | scale_factor, 42 | seq_len, 43 | seq_len, 44 | attn_batches); 45 | ); 46 | return softmax_results; 47 | } 48 | 49 | 50 | torch::Tensor bwd_cuda( 51 | torch::Tensor const& output_grads_, 52 | torch::Tensor const& softmax_results_, 53 | float scale_factor) { 54 | 55 | auto output_grads = output_grads_.contiguous(); 56 | auto softmax_results = softmax_results_.contiguous(); 57 | 58 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 59 | const int attn_batches = output_grads.size(0); 60 | const int seq_len = output_grads.size(1); 61 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 62 | 63 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 64 | 65 | //Softmax Grad 66 | DISPATCH_HALF_AND_BFLOAT( 67 | output_grads_.scalar_type(), 68 | "dispatch_scaled_upper_triang_masked_softmax_backward", 69 | dispatch_scaled_upper_triang_masked_softmax_backward( 70 | reinterpret_cast(output_grads_ptr), 71 | reinterpret_cast(output_grads_ptr), 72 | reinterpret_cast(softmax_results.data_ptr()), 73 | scale_factor, 74 | seq_len, 75 | seq_len, 76 | attn_batches); 77 | ); 78 | 79 | //backward pass is completely in-place 80 | return output_grads; 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | 4 | #include 5 | #include "compat.h" 6 | 7 | 8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) 
\ 9 | switch(TYPE) \ 10 | { \ 11 | case at::ScalarType::Half: \ 12 | { \ 13 | using scalar_t = at::Half; \ 14 | __VA_ARGS__; \ 15 | break; \ 16 | } \ 17 | case at::ScalarType::BFloat16: \ 18 | { \ 19 | using scalar_t = at::BFloat16; \ 20 | __VA_ARGS__; \ 21 | break; \ 22 | } \ 23 | default: \ 24 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 25 | } 26 | 27 | 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ 29 | switch(TYPE) \ 30 | { \ 31 | case at::ScalarType::Half: \ 32 | { \ 33 | using scalar_t = at::Half; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | case at::ScalarType::BFloat16: \ 38 | { \ 39 | using scalar_t = at::BFloat16; \ 40 | __VA_ARGS__; \ 41 | break; \ 42 | } \ 43 | case at::ScalarType::Float: \ 44 | { \ 45 | using scalar_t = float; \ 46 | __VA_ARGS__; \ 47 | break; \ 48 | } \ 49 | default: \ 50 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 51 | } 52 | 53 | 54 | 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 56 | switch(TYPEIN) \ 57 | { \ 58 | case at::ScalarType::Float: \ 59 | { \ 60 | using scalar_t_in = float; \ 61 | switch(TYPEOUT) \ 62 | { \ 63 | case at::ScalarType::Float: \ 64 | { \ 65 | using scalar_t_out = float; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | case at::ScalarType::Half: \ 70 | { \ 71 | using scalar_t_out = at::Half; \ 72 | __VA_ARGS__; \ 73 | break; \ 74 | } \ 75 | case at::ScalarType::BFloat16: \ 76 | { \ 77 | using scalar_t_out = at::BFloat16; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: \ 82 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 83 | } \ 84 | break; \ 85 | } \ 86 | case at::ScalarType::Half: \ 87 | { \ 88 | using scalar_t_in = at::Half; \ 89 | using scalar_t_out = at::Half; \ 90 | __VA_ARGS__; \ 91 | break; \ 92 | } \ 93 | case at::ScalarType::BFloat16: \ 94 | { \ 95 | using scalar_t_in = at::BFloat16; \ 96 | using scalar_t_out = at::BFloat16; \ 97 | __VA_ARGS__; \ 98 | break; \ 99 | } \ 100 | default: \ 101 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 102 | } 103 | 104 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | 5 | from .distributed import DistributedDataParallel 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | from .enums import ModelType 12 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
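# Enumerations shared across the model code: model layout (ModelType), layer role
# (LayerType), attention type (AttnType) and attention-mask type (AttnMaskType).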
2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | encoder_or_decoder_with_lbl = 3 9 | 10 | class LayerType(enum.Enum): 11 | encoder = 1 12 | decoder = 2 13 | 14 | class AttnType(enum.Enum): 15 | self_attn = 1 16 | cross_attn = 2 17 | 18 | class AttnMaskType(enum.Enum): 19 | padding = 1 20 | causal = 2 21 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/model/megablocks_utils.py: -------------------------------------------------------------------------------- 1 | """Adapter to expose MegaBlocks package, if available.""" 2 | try: 3 | import megablocks 4 | except ImportError: 5 | megablocks = None 6 | 7 | def megablocks_is_available(): 8 | return megablocks is not None 9 | 10 | def assert_megablocks_is_available(): 11 | assert megablocks_is_available(), ( 12 | 'MegaBlocks not available. Please run `pip install megablocks`.') 13 | 14 | def param_is_expert_model_parallel(param): 15 | if megablocks_is_available(): 16 | return megablocks.layers.mpu.param_is_expert_model_parallel(param) 17 | return False 18 | 19 | def copy_expert_model_parallel_attributes(destination_tensor, source_tensor): 20 | if not megablocks_is_available(): 21 | return 22 | megablocks.layers.mpu.copy_expert_model_parallel_attributes( 23 | destination_tensor, source_tensor) 24 | 25 | moe = megablocks.layers.moe if megablocks_is_available() else None 26 | dmoe = megablocks.layers.dmoe if megablocks_is_available() else None 27 | arguments = megablocks.layers.arguments if megablocks_is_available() else None 28 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | 11 | def init_method_normal(sigma): 12 | """Init method based on N(0, sigma).""" 13 | def init_(tensor): 14 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 15 | 16 | return init_ 17 | 18 | 19 | def scaled_init_method_normal(sigma, num_layers): 20 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 21 | std = sigma / math.sqrt(2.0 * num_layers) 22 | 23 | def init_(tensor): 24 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 25 | 26 | return init_ 27 | 28 | 29 | def attention_mask_func(attention_scores, attention_mask): 30 | attention_scores.masked_fill_(attention_mask, -10000.0) 31 | return attention_scores 32 | 33 | 34 | def get_linear_layer(rows, columns, init_method): 35 | """Simple linear layer with weight initialization.""" 36 | layer = torch.nn.Linear(rows, columns) 37 | if get_args().perform_initialization: 38 | init_method(layer.weight) 39 | with torch.no_grad(): 40 | layer.bias.zero_() 41 | return layer 42 | 43 | @torch.jit.script 44 | def gelu_impl(x): 45 | """OpenAI's gelu implementation.""" 46 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 47 | (1.0 + 0.044715 * x * x))) 48 | def openai_gelu(x): 49 | return gelu_impl(x) 50 | 51 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 52 | @torch.jit.script 53 | def erf_gelu(x): 54 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 55 | -------------------------------------------------------------------------------- /megatron/model/vision/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Vision Transformer(VIT) model.""" 4 | 5 | import torch 6 | from torch.nn.init import trunc_normal_ 7 | from megatron import get_args 8 | from megatron.model.utils import get_linear_layer 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3_avg 11 | from megatron.model.module import MegatronModule 12 | 13 | class VitClassificationModel(MegatronModule): 14 | """Vision Transformer Model.""" 15 | 16 | def __init__(self, num_classes, finetune=False, 17 | pre_process=True, post_process=True): 18 | super(VitClassificationModel, self).__init__() 19 | args = get_args() 20 | 21 | self.hidden_size = args.hidden_size 22 | self.num_classes = num_classes 23 | self.finetune = finetune 24 | self.pre_process = pre_process 25 | self.post_process = post_process 26 | self.backbone = VitBackbone( 27 | pre_process=self.pre_process, 28 | post_process=self.post_process, 29 | single_token_output=True 30 | ) 31 | 32 | if self.post_process: 33 | if not self.finetune: 34 | self.head = VitMlpHead(self.hidden_size, self.num_classes) 35 | else: 36 | self.head = get_linear_layer( 37 | self.hidden_size, 38 | self.num_classes, 39 | torch.nn.init.zeros_ 40 | ) 41 | 42 | def set_input_tensor(self, input_tensor): 43 | """See megatron.model.transformer.set_input_tensor()""" 44 | self.backbone.set_input_tensor(input_tensor) 45 | 46 | def forward(self, input): 47 | hidden_states = self.backbone(input) 48 | 49 | if self.post_process: 50 | hidden_states = self.head(hidden_states) 51 | 52 | return hidden_states 53 | 54 | 55 | class MitClassificationModel(MegatronModule): 56 | """Mix vision Transformer Model.""" 57 | 58 | def __init__(self, num_classes, 59 | pre_process=True, post_process=True): 60 | super(MitClassificationModel, self).__init__() 61 | args = get_args() 62 | 63 | self.hidden_size = args.hidden_size 64 | self.num_classes = num_classes 65 | 66 | self.backbone = mit_b3_avg() 67 | self.head = torch.nn.Linear(512, num_classes) 68 | self.apply(self._init_weights) 69 | 70 | def _init_weights(self, m): 71 | if isinstance(m, torch.nn.Linear): 72 | trunc_normal_(m.weight, std=.02) 73 | if isinstance(m, torch.nn.Linear) and m.bias is not None: 74 | torch.nn.init.constant_(m.bias, 0) 75 | 76 | def set_input_tensor(self, input_tensor): 77 | """See megatron.model.transformer.set_input_tensor()""" 78 | pass 79 | 80 | def forward(self, input): 81 | hidden_states = self.backbone(input) 82 | hidden_states = self.head(hidden_states) 83 | 84 | return hidden_states 85 | -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 
25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % torch.cuda.device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | torch.cuda.set_device(device) 50 | 51 | # Call the init process. 52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from commons import set_random_seed 4 | from commons import IdentityLayer 5 | from commons import print_separator 6 | from commons import initialize_distributed 7 | from mpu.cross_entropy import vocab_parallel_cross_entropy 8 | import mpu 9 | import torch.nn.functional as F 10 | import torch 11 | import random 12 | import sys 13 | sys.path.append("../..") 14 | 15 | 16 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 17 | logits_scale, seed): 18 | set_random_seed(seed) 19 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 20 | scale=logits_scale).cuda() 21 | logits = identity() 22 | target = torch.cuda.LongTensor( 23 | size=(batch_size, seq_length)).random_(0, vocab_size) 24 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 25 | target.view(-1), 26 | reduction='none').view_as(target).mean() 27 | loss.backward() 28 | return loss, identity.weight.grad 29 | 30 | 31 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 32 | logits_scale, seed): 33 | set_random_seed(seed) 34 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 35 | scale=logits_scale).cuda() 36 | logits = identity() 37 | logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits) 38 | target = torch.cuda.LongTensor( 39 | size=(batch_size, seq_length)).random_(0, vocab_size) 40 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 41 | loss.backward() 42 | return loss, identity.weight.grad 43 | 44 | 45 | def test_cross_entropy(tensor_model_parallel_size): 46 | 47 | if torch.distributed.get_rank() == 0: 48 | print('> testing cross entropy with model parallel size {} ...'. 49 | format(tensor_model_parallel_size)) 50 | 51 | mpu.initialize_model_parallel(tensor_model_parallel_size) 52 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 53 | 54 | batch_size = 13 55 | seq_length = 17 56 | vocab_size_per_partition = 11 57 | logits_scale = 1000.0 58 | vocab_size = vocab_size_per_partition * tensor_model_parallel_size 59 | seed = 1234 60 | 61 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 62 | vocab_size, logits_scale, 63 | seed) 64 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 65 | vocab_size, logits_scale, 66 | seed) 67 | 68 | error = loss_torch.sub_(loss_mpu).abs().max() 69 | print(' max error in loss on global rank {}: {}'.format( 70 | torch.distributed.get_rank(), error)) 71 | assert error < 1.0e-6 72 | 73 | error = grad_torch.sub_(grad_mpu).abs().max() 74 | print(' max error in grad on global rank {}: {}'.format( 75 | torch.distributed.get_rank(), error)) 76 | assert error < 1.0e-6 77 | 78 | # Reset groups 79 | mpu.destroy_tensor_model_parallel() 80 | 81 | torch.distributed.barrier() 82 | if torch.distributed.get_rank() == 0: 83 | print('>> passed the test :-)') 84 | 85 | 86 | if __name__ == '__main__': 87 | 88 | initialize_distributed() 89 | world_size = torch.distributed.get_world_size() 90 | 91 | tensor_model_parallel_size = 1 92 | while tensor_model_parallel_size <= world_size: 93 | print_separator('test cross entropy') 94 | test_cross_entropy(tensor_model_parallel_size) 95 | tensor_model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from mpu import data as data_utils 6 | import mpu 7 | import torch 8 | import functools 9 | import operator 10 | import sys 11 | sys.path.append("../..") 12 | 13 | 14 | def test_broadcast_data(tensor_model_parallel_size): 15 | 16 | if torch.distributed.get_rank() == 0: 17 | print('> testing broadcast_data with model parallel size {} ...'. 18 | format(tensor_model_parallel_size)) 19 | 20 | mpu.initialize_model_parallel(tensor_model_parallel_size) 21 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 22 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 23 | 24 | key_size_t = {'key1': [7, 11], 25 | 'key2': [8, 2, 1], 26 | 'key3': [13], 27 | 'key4': [5, 1, 2], 28 | 'key5': [5, 12]} 29 | keys = list(key_size_t.keys()) 30 | 31 | data = {} 32 | data_t = {} 33 | for key in key_size_t: 34 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 35 | data_t[key] = data[key].clone() 36 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 37 | data_t['keyX'] = data['keyX'].clone() 38 | if mpu.get_tensor_model_parallel_rank() != 0: 39 | data = None 40 | 41 | data_utils._check_data_types(keys, data_t, torch.int64) 42 | key_size, key_numel, \ 43 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 44 | for key in keys: 45 | assert key_size[key] == key_size_t[key] 46 | total_numel_t = 0 47 | for key in keys: 48 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 49 | assert key_numel[key] == target_size 50 | total_numel_t += target_size 51 | assert total_numel == total_numel_t 52 | 53 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 54 | for key in keys: 55 | tensor = data_t[key].cuda() 56 | assert data_b[key].sub(tensor).abs().max() == 0 57 | 58 | # Reset groups 59 | mpu.destroy_tensor_model_parallel() 60 | 61 | torch.distributed.barrier() 62 | if torch.distributed.get_rank() == 0: 63 | print('>> passed the test :-)') 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | initialize_distributed() 69 | world_size = torch.distributed.get_world_size() 70 | 71 | tensor_model_parallel_size = 1 72 | while tensor_model_parallel_size <= world_size: 73 | print_separator('test test broadcast data') 74 | test_broadcast_data(tensor_model_parallel_size) 75 | tensor_model_parallel_size *= 2 76 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | import mpu 6 | import torch 7 | import sys 8 | sys.path.append("../..") 9 | 10 | 11 | def test_initialize_model_parallel(tensor_model_parallel_size): 12 | 13 | if torch.distributed.get_rank() == 0: 14 | print('> testing initialize_model_parallel with size {} ...'.format( 15 | tensor_model_parallel_size)) 16 | tensor_model_parallel_size_ = min(tensor_model_parallel_size, 17 | torch.distributed.get_world_size()) 18 | assert not mpu.model_parallel_is_initialized() 19 | mpu.initialize_model_parallel(tensor_model_parallel_size_) 20 | assert mpu.model_parallel_is_initialized() 21 | 22 | # Checks. 
23 | def check(group, world_size, rank): 24 | assert world_size == torch.distributed.get_world_size(group=group) 25 | assert rank == torch.distributed.get_rank(group=group) 26 | 27 | # Model parallel. 28 | world_size = tensor_model_parallel_size_ 29 | rank = torch.distributed.get_rank() % tensor_model_parallel_size_ 30 | assert world_size == mpu.get_tensor_model_parallel_world_size() 31 | assert rank == mpu.get_tensor_model_parallel_rank() 32 | check(mpu.get_tensor_model_parallel_group(), world_size, rank) 33 | 34 | # Data parallel. 35 | world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ 36 | rank = torch.distributed.get_rank() // tensor_model_parallel_size 37 | assert world_size == mpu.get_data_parallel_world_size() 38 | assert rank == mpu.get_data_parallel_rank() 39 | check(mpu.get_data_parallel_group(), world_size, rank) 40 | 41 | # Reset groups 42 | mpu.destroy_model_parallel() 43 | 44 | torch.distributed.barrier() 45 | if torch.distributed.get_rank() == 0: 46 | print('>> passed the test :-)') 47 | 48 | 49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): 50 | 51 | if torch.distributed.get_rank() == 0: 52 | print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format( 53 | tensor_model_parallel_size_)) 54 | tensor_model_parallel_size = min(tensor_model_parallel_size_, 55 | torch.distributed.get_world_size()) 56 | assert not mpu.model_parallel_is_initialized() 57 | mpu.initialize_model_parallel(tensor_model_parallel_size) 58 | assert mpu.model_parallel_is_initialized() 59 | 60 | # Checks 61 | src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank() 62 | assert mpu.get_tensor_model_parallel_src_rank() == src_rank 63 | 64 | # Reset groups 65 | mpu.destroy_model_parallel() 66 | 67 | torch.distributed.barrier() 68 | if torch.distributed.get_rank() == 0: 69 | print('>> passed the test :-)') 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | initialize_distributed() 75 | world_size = torch.distributed.get_world_size() 76 | tensor_model_parallel_size = 1 77 | while tensor_model_parallel_size <= world_size: 78 | print_separator('test initialize model parallel') 79 | test_initialize_model_parallel(tensor_model_parallel_size) 80 | print_separator('test model parallel source rank') 81 | test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) 82 | tensor_model_parallel_size *= 2 83 | -------------------------------------------------------------------------------- /megatron/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Megatron 9 | 71 | 72 | 73 |
74 | [index.html page body: "Prompt Megatron" prompt box and a "0 / 1000" character counter]
86 | 87 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /megatron/text_generation/sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Sampling utilities. 
4 | Part of this code is inspired by: 5 | - https://github.com/ari-holtzman/degen/blob/master/gen.py 6 | - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html 7 | """ 8 | 9 | 10 | import torch 11 | 12 | 13 | 14 | def modify_logits_for_top_k_filtering(logits, top_k): 15 | """Set the logits for none top-k values to -inf.""" 16 | 17 | filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] 18 | logits.masked_fill_(filter_, float('-Inf')) 19 | 20 | 21 | 22 | def modify_logits_for_top_p_filtering(logits, top_p): 23 | """Set the logits for none top-p values to -inf.""" 24 | 25 | # First sort and calculate cumulative sum of probabilities. 26 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 27 | cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) 28 | 29 | # Filteration based on the cumulative sum. 30 | filter_ = cumulative_probs > top_p 31 | # This shift by 1 is weird and I cannot justify it. This existed 32 | # in the original implementation: 33 | # https://github.com/ari-holtzman/degen/blob/master/gen.py 34 | # and I guess it is needed so keeping it for now. 35 | filter_[:, 1:] = filter_[:, :-1].clone() 36 | # Make sure we at least have one token to select from. 37 | filter_[..., 0] = 0 38 | 39 | # Fill in the filtered part 40 | filter_ = filter_.scatter(1, sorted_indices, filter_) 41 | logits.masked_fill_(filter_, float('-Inf')) 42 | 43 | 44 | 45 | def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): 46 | """ Sample and generate a token. 47 | Note: logits has the dimension [b, v] where b is the batch size 48 | and v is the vocabulary size. 49 | If vocab_size is provided, we will make sure the sample that is 50 | generated is in [0, vocab-size). This will avoid out of vocabulary 51 | generations due to padding. 52 | """ 53 | 54 | # Check logits for consistency. 55 | assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' 56 | assert logits.type() == 'torch.cuda.FloatTensor', \ 57 | 'input logits should be floats.' 58 | 59 | 60 | # Greedy is just simple argmax. 61 | if top_k == 1: 62 | assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' 63 | samples = torch.argmax(logits, dim=-1) 64 | 65 | # Top-k or top-p sampling. 66 | else: 67 | # Clone so we do not modify the inputs, 68 | logits = logits.clone() 69 | # Apply temperature in place. 70 | if temperature != 1.0: 71 | logits.div_(temperature) 72 | 73 | if top_k > 1: 74 | assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' 75 | assert top_k <= logits.size(1), 'top-k is larger than logit size.' 76 | if vocab_size: 77 | assert top_k < vocab_size, 'top-k is larger than vocab size.' 78 | modify_logits_for_top_k_filtering(logits, top_k) 79 | 80 | elif top_p > 0.0: 81 | assert top_p <= 1.0, 'top-p should be in (0, 1].' 82 | modify_logits_for_top_p_filtering(logits, top_p) 83 | 84 | # After filtering, we need to recalculate the distribution. 85 | probs = logits.softmax(dim=-1) 86 | samples = torch.multinomial(probs, num_samples=1).view(-1) 87 | 88 | # If vocab size is provided, make sure the samples are in 89 | # in the range [0, vocab-size). 90 | if vocab_size: 91 | samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) 92 | 93 | return samples 94 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /pretrain_vision_classify.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Pretrain VIT""" 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from functools import partial 8 | from megatron import get_args, get_timers, print_rank_0 9 | from megatron.data.vit_dataset import build_train_valid_datasets 10 | from megatron.model import ModelType 11 | from megatron.model.vision.classification import VitClassificationModel 12 | from megatron.model.vision.classification import MitClassificationModel 13 | from megatron.training import pretrain 14 | from megatron.utils import average_losses_across_data_parallel_group 15 | 16 | 17 | def model_provider(pre_process=True, post_process=True): 18 | """Build the model.""" 19 | 20 | args = get_args() 21 | 22 | if args.vision_backbone_type == 'vit': 23 | print_rank_0("building VIT model ...") 24 | model = VitClassificationModel(num_classes=args.num_classes, 25 | pre_process=pre_process, 26 | post_process=post_process) 27 | elif args.vision_backbone_type == 'mit': 28 | print_rank_0("building MIT model ...") 29 | model = MitClassificationModel(num_classes=args.num_classes, 30 | pre_process=pre_process, 31 | post_process=post_process) 32 | else: 33 | raise Exception('{} vision backbone is not supported.'.format( 34 | args.vision_backbone_type)) 35 | return model 36 | 37 | 38 | def get_batch(data_iterator): 39 | """Build the batch.""" 40 | data = next(data_iterator) 41 | 42 | # only data parallelism; no need for broadcast 43 | images = data[0].cuda() 44 | labels = data[1].cuda() 45 | 46 | return images, labels 47 | 48 | 49 | def loss_func(labels, output_tensor): 50 | logits = output_tensor.contiguous().float() 51 | loss = F.cross_entropy(logits, labels) 52 | 53 | outputs = torch.argmax(logits, -1) 54 | correct = (outputs == labels).float() 55 | accuracy = torch.mean(correct) 56 | 57 | averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) 58 | 59 | return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} 60 | 61 | 62 | def forward_step(data_iterator, model): 63 | """Forward step.""" 64 | timers = get_timers() 65 | 66 | # Get the batch. 67 | timers("batch-generator", log_level=2).start() 68 | ( 69 | images, 70 | labels, 71 | ) = get_batch(data_iterator) 72 | timers("batch-generator").stop() 73 | 74 | # Forward model. lm_labels 75 | output_tensor = model(images) 76 | 77 | return output_tensor, partial(loss_func, labels) 78 | 79 | def train_valid_test_datasets_provider(train_val_test_num_samples): 80 | """Build train, valid, and test datasets.""" 81 | args = get_args() 82 | 83 | print_rank_0( 84 | "> building train, validation, and test datasets " "for VIT ..." 
85 | ) 86 | train_ds, valid_ds = build_train_valid_datasets( 87 | data_path=args.data_path, 88 | image_size=(args.img_h, args.img_w) 89 | ) 90 | print_rank_0("> finished creating VIT datasets ...") 91 | 92 | return train_ds, valid_ds, None 93 | 94 | 95 | if __name__ == "__main__": 96 | 97 | pretrain( 98 | train_valid_test_datasets_provider, 99 | model_provider, 100 | ModelType.encoder_or_decoder, 101 | forward_step, 102 | args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True} 103 | ) 104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="megatron.core", 5 | version="0.1", 6 | description="Core components of Megatron.", 7 | packages=find_packages( 8 | include=("megatron.core") 9 | ) 10 | ) 11 | -------------------------------------------------------------------------------- /tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Tasks data utility.""" 4 | 5 | import re 6 | import numpy as np 7 | 8 | 9 | def clean_text(text): 10 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 11 | 12 | text = text.replace("\n", " ") 13 | text = re.sub(r'\s+', ' ', text) 14 | for _ in range(3): 15 | text = text.replace(' . ', '. ') 16 | 17 | return text 18 | 19 | 20 | def build_sample(ids, types, paddings, label, unique_id): 21 | """Convert to numpy and return a sample consumed by the batch producer.""" 22 | 23 | ids_np = np.array(ids, dtype=np.int64) 24 | types_np = np.array(types, dtype=np.int64) 25 | paddings_np = np.array(paddings, dtype=np.int64) 26 | sample = ({'text': ids_np, 27 | 'types': types_np, 28 | 'padding_mask': paddings_np, 29 | 'label': int(label), 30 | 'uid': int(unique_id)}) 31 | 32 | return sample 33 | 34 | 35 | def build_tokens_types_paddings_from_text(text_a, text_b, 36 | tokenizer, max_seq_length): 37 | """Build token types and paddings, trim if needed, and pad if needed.""" 38 | 39 | text_a_ids = tokenizer.tokenize(text_a) 40 | text_b_ids = None 41 | if text_b is not None: 42 | text_b_ids = tokenizer.tokenize(text_b) 43 | 44 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 45 | max_seq_length, tokenizer.cls, 46 | tokenizer.sep, tokenizer.pad) 47 | 48 | 49 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, 50 | cls_id, sep_id, pad_id): 51 | """Build token types and paddings, trim if needed, and pad if needed.""" 52 | 53 | ids = [] 54 | types = [] 55 | paddings = [] 56 | 57 | # [CLS]. 58 | ids.append(cls_id) 59 | types.append(0) 60 | paddings.append(1) 61 | 62 | # A. 63 | len_text_a = len(text_a_ids) 64 | ids.extend(text_a_ids) 65 | types.extend([0] * len_text_a) 66 | paddings.extend([1] * len_text_a) 67 | 68 | # [SEP]. 69 | ids.append(sep_id) 70 | types.append(0) 71 | paddings.append(1) 72 | 73 | # B. 74 | if text_b_ids is not None: 75 | len_text_b = len(text_b_ids) 76 | ids.extend(text_b_ids) 77 | types.extend([1] * len_text_b) 78 | paddings.extend([1] * len_text_b) 79 | 80 | # Cap the size. 81 | trimmed = False 82 | if len(ids) >= max_seq_length: 83 | max_seq_length_m1 = max_seq_length - 1 84 | ids = ids[0:max_seq_length_m1] 85 | types = types[0:max_seq_length_m1] 86 | paddings = paddings[0:max_seq_length_m1] 87 | trimmed = True 88 | 89 | # [SEP]. 
90 | if (text_b_ids is not None) or trimmed: 91 | ids.append(sep_id) 92 | if text_b_ids is None: 93 | types.append(0) 94 | else: 95 | types.append(1) 96 | paddings.append(1) 97 | 98 | # Padding. 99 | padding_length = max_seq_length - len(ids) 100 | if padding_length > 0: 101 | ids.extend([pad_id] * padding_length) 102 | types.extend([pad_id] * padding_length) 103 | paddings.extend([0] * padding_length) 104 | 105 | return ids, types, paddings 106 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/glue/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """GLUE finetuning/evaluation.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.classification import Classification 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | 12 | 13 | def glue_classification(num_classes, Dataset, 14 | name_from_datapath_func): 15 | 16 | def train_valid_datasets_provider(): 17 | """Build train and validation dataset.""" 18 | args = get_args() 19 | tokenizer = get_tokenizer() 20 | 21 | train_dataset = Dataset('training', args.train_data, 22 | tokenizer, args.seq_length) 23 | valid_dataset = Dataset('validation', args.valid_data, 24 | tokenizer, args.seq_length) 25 | 26 | return train_dataset, valid_dataset 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | args = get_args() 31 | 32 | print_rank_0('building classification model for {} ...'.format( 33 | args.task)) 34 | model = Classification(num_classes=num_classes, num_tokentypes=2, 35 | pre_process=pre_process, post_process=post_process) 36 | 37 | return model 38 | 39 | def metrics_func_provider(): 40 | """Privde metrics callback function.""" 41 | def single_dataset_provider(datapath): 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | name = name_from_datapath_func(datapath) 46 | return Dataset(name, [datapath], tokenizer, args.seq_length) 47 | return accuracy_func_provider(single_dataset_provider) 48 | 49 | """Finetune/evaluate.""" 50 | finetune(train_valid_datasets_provider, model_provider, 51 | end_of_epoch_callback_provider=metrics_func_provider) 52 | 53 | 54 | def main(): 55 | args = get_args() 56 | 57 | if args.task == 'MNLI': 58 | 59 | num_classes = 3 60 | from tasks.glue.mnli import MNLIDataset as Dataset 61 | 62 | def name_from_datapath(datapath): 63 | return datapath.split('MNLI')[-1].strip( 64 | '.tsv').strip('/').replace('_', '-') 65 | 66 | elif args.task == 'QQP': 67 | 68 | num_classes = 2 69 | from tasks.glue.qqp import QQPDataset as Dataset 70 | 71 | def name_from_datapath(datapath): 72 | return datapath.split('QQP')[-1].strip( 73 | '.tsv').strip('/').replace('_', '-') 74 | 75 | else: 76 | raise NotImplementedError('GLUE task {} is not implemented.'.format( 77 | args.task)) 78 | 79 | glue_classification(num_classes, Dataset, name_from_datapath) 80 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """MNLI dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 11 | 12 | 13 | class MNLIDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label='contradiction'): 17 | self.test_label = test_label 18 | super().__init__('MNLI', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 10: 35 | is_test = True 36 | print_rank_0( 37 | ' reading {}, {} and {} columns and setting ' 38 | 'labels to {}'.format( 39 | row[0].strip(), row[8].strip(), 40 | row[9].strip(), self.test_label)) 41 | else: 42 | print_rank_0(' reading {} , {}, {}, and {} columns ' 43 | '...'.format( 44 | row[0].strip(), row[8].strip(), 45 | row[9].strip(), row[-1].strip())) 46 | continue 47 | 48 | text_a = clean_text(row[8].strip()) 49 | text_b = clean_text(row[9].strip()) 50 | unique_id = int(row[0].strip()) 51 | label = row[-1].strip() 52 | if is_test: 53 | label = self.test_label 54 | 55 | assert len(text_a) > 0 56 | assert len(text_b) > 0 57 | assert label in LABELS 58 | assert unique_id >= 0 59 | 60 | sample = {'text_a': text_a, 61 | 'text_b': text_b, 62 | 'label': LABELS[label], 63 | 'uid': unique_id} 64 | total += 1 65 | samples.append(sample) 66 | 67 | if total % 50000 == 0: 68 | print_rank_0(' > processed {} so far ...'.format(total)) 69 | 70 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 71 | return samples 72 | -------------------------------------------------------------------------------- /tasks/glue/qqp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """QQP dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = [0, 1] 11 | 12 | 13 | class QQPDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label=0): 17 | self.test_label = test_label 18 | super().__init__('QQP', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 3: 35 | is_test = True 36 | print_rank_0(' reading {}, {}, and {} columns and ' 37 | 'setting labels to {}'.format( 38 | row[0].strip(), row[1].strip(), 39 | row[2].strip(), self.test_label)) 40 | else: 41 | assert len(row) == 6 42 | print_rank_0(' reading {}, {}, {}, and {} columns' 43 | ' ...'.format( 44 | row[0].strip(), row[3].strip(), 45 | row[4].strip(), row[5].strip())) 46 | continue 47 | 48 | if is_test: 49 | assert len(row) == 3, 'expected length 3: {}'.format(row) 50 | uid = int(row[0].strip()) 51 | text_a = clean_text(row[1].strip()) 52 | text_b = clean_text(row[2].strip()) 53 | label = self.test_label 54 | assert len(text_a) > 0 55 | assert len(text_b) > 0 56 | else: 57 | if len(row) == 6: 58 | uid = int(row[0].strip()) 59 | text_a = clean_text(row[3].strip()) 60 | text_b = clean_text(row[4].strip()) 61 | label = int(row[5].strip()) 62 | else: 63 | print_rank_0('***WARNING*** index error, ' 64 | 'skipping: {}'.format(row)) 65 | continue 66 | if len(text_a) == 0: 67 | print_rank_0('***WARNING*** zero length a, ' 68 | 'skipping: {}'.format(row)) 69 | continue 70 | if len(text_b) == 0: 71 | print_rank_0('***WARNING*** zero length b, ' 72 | 'skipping: {}'.format(row)) 73 | continue 74 | assert label in LABELS 75 | assert uid >= 0 76 | 77 | sample = {'uid': uid, 78 | 'text_a': text_a, 79 | 'text_b': text_b, 80 | 'label': label} 81 | total += 1 82 | samples.append(sample) 83 | 84 | if total % 50000 == 0: 85 | print_rank_0(' > processed {} so far ...'.format(total)) 86 | 87 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 88 | return samples 89 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 
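For reference, the F1 used in the automatic evaluation is a normalized token-overlap F1 computed per guess/answer pair (implemented in `tasks/msdp/metrics.py` and driven by `tasks/msdp/main.py` with `--task MSDP-EVAL-F1 --guess-file ... --answer-file ...`). A minimal standalone sketch of that metric; the helper names and the example strings below are ours, not part of the evaluation script:

```python
# Token-level F1 as in tasks/msdp/metrics.py: lowercase, strip punctuation
# and articles, then compare the bag-of-words overlap of guess vs. answer.
import re
from collections import Counter

re_art = re.compile(r'\b(a|an|the)\b')
re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')

def normalize(s):
    s = re_punc.sub(' ', s.lower())
    s = re_art.sub(' ', s)
    return ' '.join(s.split())

def token_f1(guess, answer):
    g, a = normalize(guess).split(), normalize(answer).split()
    num_same = sum((Counter(g) & Counter(a)).values())
    if num_same == 0:
        return 0.0
    precision, recall = num_same / len(g), num_same / len(a)
    return 2 * precision * recall / (precision + recall)

# Made-up example pair: precision = 5/5, recall = 5/7, so F1 is roughly 0.83.
print(token_f1("The Eiffel Tower is in Paris.",
               "the eiffel tower is located in paris, france"))
```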
15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/msdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Run multi-stage dialogue prompting (MSDP).""" 4 | 5 | import os 6 | import sys 7 | sys.path.append(os.path.abspath(os.path.join( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) 9 | from megatron import get_args 10 | from megatron.initialize import initialize_megatron 11 | 12 | 13 | def get_tasks_args(parser): 14 | """Provide extra arguments required for tasks.""" 15 | group = parser.add_argument_group(title='tasks') 16 | 17 | # parameters for the knowledgeable dialogue generation 18 | group.add_argument('--task', type=str, required=True, 19 | help='Task name.') 20 | group.add_argument("--sample-input-file", type=str, default=None, 21 | help='Get input from file instead of interactive mode, ' 22 | 'each line is an input.') 23 | group.add_argument("--sample-output-file", type=str, default=None, 24 | help='Output file got from --sample-input-file') 25 | group.add_argument('--prompt-file', type=str, default=None, 26 | help='prompting file') 27 | group.add_argument('--prompt-type', type=str, default=None, 28 | choices=['knowledge', 'response'], 29 | help='prompt type (knowledge or response)') 30 | group.add_argument('--num-prompt-examples', type=int, default=10, 31 | help='number of prompt examples') 32 | group.add_argument('--guess-file', type=str, default=None, 33 | help='datapath for generated sentences') 34 | group.add_argument('--answer-file', type=str, default=None, 35 | help='datapath for golden sentences') 36 | group.add_argument('--out-seq-length', type=int, default=100, 37 | help='output sequence length') 38 | group.add_argument('--api-prompt', default=False, action="store_true", 39 | help='setup model api for prompting') 40 | group.add_argument('--megatron-api-url', type=str, default=None, 41 | help='url of the megatron api') 42 | 43 | return parser 44 | 45 | 46 | if __name__ == '__main__': 47 | 48 | initialize_megatron(extra_args_provider=get_tasks_args) 49 | 50 | args = get_args() 51 | 52 | if args.num_layers_per_virtual_pipeline_stage is not None: 53 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 54 | exit() 55 | 56 | if args.task == 'MSDP-PROMPT': 57 | from tasks.msdp.prompt import main 58 | 59 | elif args.task == 'MSDP-EVAL-F1': 60 | from tasks.msdp.evaluate import main 61 | 62 | else: 63 | raise NotImplementedError('Task {} is not implemented.'.format( 64 | args.task)) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /tasks/msdp/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | 14 | re_art = re.compile(r'\b(a|an|the)\b') 15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 16 | 17 | 18 | def normalize_answer(s): 19 | """ 20 | Lower text and remove punctuation, articles and extra whitespace. 21 | """ 22 | s = s.lower() 23 | s = re_punc.sub(' ', s) 24 | s = re_art.sub(' ', s) 25 | s = ' '.join(s.split()) 26 | return s 27 | 28 | 29 | class F1Metric: 30 | """ 31 | Helper class which computes token-level F1. 
32 | """ 33 | 34 | @staticmethod 35 | def _prec_recall_f1_score(pred_items, gold_items): 36 | """ 37 | Compute precision, recall and f1 given a set of gold and prediction items. 38 | :param pred_items: iterable of predicted values 39 | :param gold_items: iterable of gold values 40 | :return: tuple (p, r, f1) for precision, recall, f1 41 | """ 42 | common = Counter(gold_items) & Counter(pred_items) 43 | num_same = sum(common.values()) 44 | if num_same == 0: 45 | return 0, 0, 0 46 | precision = 1.0 * num_same / len(pred_items) 47 | recall = 1.0 * num_same / len(gold_items) 48 | f1 = (2 * precision * recall) / (precision + recall) 49 | return precision, recall, f1 50 | 51 | @staticmethod 52 | def compute_each_pair(guess: str, answer: str): 53 | if answer == "": 54 | return None, None, None 55 | if guess == "": 56 | return 0, 0, 0 57 | g_tokens = normalize_answer(guess).split() 58 | a_tokens = normalize_answer(answer).split() 59 | 60 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 61 | return precision, recall, f1 62 | 63 | @staticmethod 64 | def compute_all_pairs(guesses: List[str], answers: List[str]): 65 | # additional augment: 66 | assert len(guesses) == len(answers) 67 | 68 | precision_list, recall_list, f1_list = [], [], [] 69 | for guess, answer in zip(guesses, answers): 70 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) 71 | if precision is None or recall is None or f1 is None: 72 | continue 73 | precision_list.append(precision) 74 | recall_list.append(recall) 75 | f1_list.append(f1) 76 | 77 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 78 | -------------------------------------------------------------------------------- /tasks/orqa/README.md: -------------------------------------------------------------------------------- 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering 2 | 3 | Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 4 | 5 | ## Retriever Training 6 | 7 | #### Unsupervised pretraining 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 |
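For reference, each line of the `corpus.json` input above is expected to be a single self-contained JSON object carrying the `--json-keys` fields, e.g. `{"title": "Example article", "text": "First sentence. Second sentence."}` (the values here are made-up placeholders). Assuming the default output naming of [`tools/preprocess_data.py`](../../tools/preprocess_data.py), the command then writes one indexed dataset per key:

<pre>
corpus_indexed_text_sentence.bin
corpus_indexed_text_sentence.idx
corpus_indexed_title_sentence.bin
corpus_indexed_title_sentence.idx
</pre>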
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | 13 | 14 | def train_valid_datasets_provider(): 15 | """Provide train and validation datasets.""" 16 | args = get_args() 17 | tokenizer = get_tokenizer() 18 | 19 | train_dataset = RaceDataset('training', args.train_data, 20 | tokenizer, args.seq_length) 21 | valid_dataset = RaceDataset('validation', args.valid_data, 22 | tokenizer, args.seq_length) 23 | 24 | return train_dataset, valid_dataset 25 | 26 | 27 | def model_provider(pre_process=True, post_process=True): 28 | """Build the model.""" 29 | 30 | print_rank_0('building multichoice model for RACE ...') 31 | model = MultipleChoice(num_tokentypes=2, 32 | pre_process=pre_process, 33 | post_process=post_process) 34 | 35 | return model 36 | 37 | 38 | def metrics_func_provider(): 39 | """Privde metrics callback function.""" 40 | args = get_args() 41 | tokenizer = get_tokenizer() 42 | 43 | def single_dataset_provider(datapath): 44 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 45 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 46 | 47 | return accuracy_func_provider(single_dataset_provider) 48 | 49 | 50 | def main(): 51 | 52 | finetune(train_valid_datasets_provider, model_provider, 53 | end_of_epoch_callback_provider=metrics_func_provider) 54 | -------------------------------------------------------------------------------- /tasks/vision/classification/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Vision-classification finetuning/evaluation.""" 4 | 5 | import torch.nn.functional as F 6 | from functools import partial 7 | from megatron import get_args, get_timers 8 | from megatron import print_rank_0 9 | from megatron.model.vision.classification import VitClassificationModel 10 | from megatron.data.vit_dataset import build_train_valid_datasets 11 | from tasks.vision.classification.eval_utils import accuracy_func_provider 12 | from tasks.vision.finetune_utils import finetune 13 | from megatron.utils import average_losses_across_data_parallel_group 14 | 15 | 16 | def classification(): 17 | def train_valid_datasets_provider(): 18 | """Build train and validation dataset.""" 19 | args = get_args() 20 | 21 | train_ds, valid_ds = build_train_valid_datasets( 22 | data_path=args.data_path, 23 | image_size=(args.img_h, args.img_w), 24 | ) 25 | return train_ds, valid_ds 26 | 27 | def model_provider(pre_process=True, post_process=True): 28 | """Build the model.""" 29 | args = get_args() 30 | 31 | print_rank_0("building classification model for ImageNet ...") 32 | 33 | return VitClassificationModel(num_classes=args.num_classes, finetune=True, 34 | pre_process=pre_process, post_process=post_process) 35 | 36 | def process_batch(batch): 37 | """Process batch and produce inputs for the model.""" 38 | images = batch[0].cuda().contiguous() 39 | labels = batch[1].cuda().contiguous() 40 | return images, labels 41 | 42 | def cross_entropy_loss_func(labels, output_tensor): 43 | logits = output_tensor 44 | 45 | # Cross-entropy loss. 46 | loss = F.cross_entropy(logits.contiguous().float(), labels) 47 | 48 | # Reduce loss for logging. 
49 | averaged_loss = average_losses_across_data_parallel_group([loss]) 50 | 51 | return loss, {'lm loss': averaged_loss[0]} 52 | 53 | def _cross_entropy_forward_step(batch, model): 54 | """Simple forward step with cross-entropy loss.""" 55 | timers = get_timers() 56 | 57 | # Get the batch. 58 | timers("batch generator", log_level=2).start() 59 | try: 60 | batch_ = next(batch) 61 | except BaseException: 62 | batch_ = batch 63 | images, labels = process_batch(batch_) 64 | timers("batch generator").stop() 65 | 66 | # Forward model. 67 | output_tensor = model(images) 68 | 69 | return output_tensor, partial(cross_entropy_loss_func, labels) 70 | 71 | """Finetune/evaluate.""" 72 | finetune( 73 | train_valid_datasets_provider, 74 | model_provider, 75 | forward_step=_cross_entropy_forward_step, 76 | end_of_epoch_callback_provider=accuracy_func_provider, 77 | ) 78 | 79 | def main(): 80 | classification() 81 | 82 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/seg_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | import math 3 | import einops 4 | import torch 5 | import apex 6 | import torch.nn.functional as F 7 | from megatron import get_args 8 | from megatron.model.module import MegatronModule 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5 11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead 12 | 13 | 14 | class SetrSegmentationModel(MegatronModule): 15 | 16 | def __init__(self, 17 | num_classes, 18 | pre_process=True, 19 | post_process=True): 20 | super(SetrSegmentationModel, self).__init__() 21 | args = get_args() 22 | assert post_process & pre_process 23 | self.hidden_size = args.hidden_size 24 | self.num_classes = num_classes 25 | self.backbone = VitBackbone( 26 | pre_process=pre_process, 27 | post_process=post_process, 28 | class_token=False, 29 | post_layer_norm=False, 30 | drop_path_rate=0.1 31 | ) 32 | 33 | self.head = SetrSegmentationHead( 34 | self.hidden_size, 35 | self.num_classes 36 | ) 37 | 38 | def set_input_tensor(self, input_tensor): 39 | """See megatron.model.transformer.set_input_tensor()""" 40 | pass 41 | 42 | def forward(self, input): 43 | # [b hw c] 44 | hidden_states = self.backbone(input) 45 | result_final = self.head(hidden_states) 46 | return result_final 47 | 48 | 49 | class SegformerSegmentationModel(MegatronModule): 50 | 51 | def __init__(self, 52 | num_classes, 53 | pre_process=True, 54 | post_process=True): 55 | super(SegformerSegmentationModel, self).__init__() 56 | args = get_args() 57 | self.hidden_size = args.hidden_size 58 | self.num_classes = num_classes 59 | self.pre_process = pre_process 60 | self.post_process = post_process 61 | 62 | self.backbone = mit_b5() 63 | self.head = SegformerSegmentationHead( 64 | feature_strides=[4, 8, 16, 32], 65 | in_channels=[64, 128, 320, 512], 66 | embedding_dim=768, 67 | dropout_ratio=0.1 68 | ) 69 | 70 | def set_input_tensor(self, input_tensor): 71 | """See megatron.model.transformer.set_input_tensor()""" 72 | pass 73 | 74 | def forward(self, input): 75 | # [b hw c] 76 | hidden_states = self.backbone(input) 77 | hidden_states = self.head(hidden_states) 78 | return hidden_states 79 | 80 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | from megatron import get_args 5 | 6 | def slidingcrops(img, mask): 7 | # img: [b c h w] 8 | # mask: [b h w] 9 | args = get_args() 10 | assert args.img_h == args.img_w 11 | crop_size = args.img_h 12 | stride = args.seg_stride 13 | ignore_index = args.ignore_index 14 | n, c, h, w = img.shape 15 | assert h >= crop_size 16 | assert w >= crop_size 17 | long_size = max(h, w) 18 | 19 | img_slices, mask_slices, slices_info = [], [], [] 20 | if long_size > crop_size: 21 | assert stride <= crop_size 22 | h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1 23 | w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1 24 | for yy in range(h_step_num): 25 | for xx in range(w_step_num): 26 | sy, sx = yy * stride, xx * stride 27 | ey, ex = sy + crop_size, sx + crop_size 28 | img_sub = img[:, :, sy: ey, sx: ex] 29 | mask_sub = mask[:, sy: ey, sx: ex] 30 | 31 | # padding 32 | sub_h, sub_w = img_sub.shape[2:] 33 | pad_h = max(crop_size - sub_h, 0) 34 | pad_w = max(crop_size - sub_w, 0) 35 | img_sub = torch.nn.functional.pad(img_sub, 
pad=(0, pad_w, 0, pad_h), value=ignore_index) 36 | mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h)) 37 | 38 | img_slices.append(img_sub) 39 | mask_slices.append(mask_sub) 40 | slices_info.append([sy, ey, sx, ex, sub_h, sub_w]) 41 | 42 | return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w) 43 | else: 44 | return img, mask, [[0, h, 0, w, h, w]], (h, w) 45 | 46 | 47 | def slidingjoins(preds, probs, labels, slices_info, img_size): 48 | args = get_args() 49 | num_slices = len(slices_info) 50 | 51 | if num_slices == 1: 52 | return preds, labels 53 | 54 | h, w = img_size 55 | split_size = args.micro_batch_size 56 | 57 | preds_split = torch.split(preds, split_size) 58 | probs_split = torch.split(probs, split_size) 59 | labels_split = torch.split(labels, split_size) 60 | 61 | assert(len(preds_split) == num_slices) 62 | 63 | total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda') 64 | total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 65 | total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 66 | 67 | for i in range(num_slices): 68 | sy, ey, sx, ex, sub_h, sub_w = slices_info[i] 69 | assert sy + sub_h <= h 70 | assert sx + sub_w <= w 71 | curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] 72 | curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w] 73 | 74 | local_max_probs = probs_split[i][:, :sub_h, : sub_w] 75 | local_preds = preds_split[i][:, :sub_h, :sub_w] 76 | 77 | result_max_probs = torch.maximum(curr_max_probs, local_max_probs) 78 | result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds) 79 | 80 | total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs 81 | total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds 82 | total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w] 83 | 84 | return total_preds, total_labels 85 | 86 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? 
") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/tests/__init__.py -------------------------------------------------------------------------------- /tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/tensor_parallel/test_random.py: 
-------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER 4 | from megatron.core.tensor_parallel.random import checkpoint 5 | from tests.test_utilities import Utils 6 | import pytest 7 | import torch 8 | 9 | def test_cuda_rng_states_tracker(): 10 | rng_tracker = CudaRNGStatesTracker() 11 | rng_tracker.set_states({"state1":1234}) 12 | assert(rng_tracker.get_states()["state1"] == 1234) 13 | rng_tracker.reset() 14 | assert(rng_tracker.get_states() == {}) 15 | seed = 1111 16 | rng_tracker.add("state2",seed) 17 | with pytest.raises(Exception): 18 | assert(rng_tracker.add("state3",seed)) 19 | with pytest.raises(Exception): 20 | assert(rng_tracker.add("state2",111)) 21 | assert(rng_tracker.get_states()['state2'] is not None) 22 | with pytest.raises(Exception): 23 | assert() 24 | 25 | rng_tracker.fork("state2") 26 | torch.cuda.manual_seed(seed) 27 | rng_state = torch.cuda.get_rng_state() 28 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 29 | 30 | def test_model_parallel_cuda_manual_seed(): 31 | Utils.initialize_model_parallel(4,2) 32 | model_parallel_cuda_manual_seed(0) 33 | assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/tensor_parallel/test_tensor_parallel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import megatron.core.tensor_parallel.utils as util 3 | import megatron.core.parallel_state as ps 4 | from tests.test_utilities import Utils 5 | 6 | rank = Utils.rank 7 | 8 | def test_split_tensor_along_last_dim(): 9 | input_tensor = torch.rand((3,4)) 10 | torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) 11 | torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) 12 | 13 | def test_split_tensor_into_1d_equal_chunks(): 14 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 15 | input_tensor = torch.rand((3,4)) 16 | output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) 17 | if rank % 2 == 0 : 18 | start = 0 19 | end = int(input_tensor.numel()/2) 20 | else : 21 | start = int(input_tensor.numel()/2) 22 | end = input_tensor.numel() 23 | 24 | assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) 25 | Utils.destroy_model_parallel() 26 | 27 | def test_gather_split_1d_tensor(): 28 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 29 | input_tensor = torch.ones((2,4)).cuda() * rank 30 | actual_output_tensor = util.gather_split_1d_tensor(input_tensor) 31 | if rank %2 == 0: 32 | expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) 33 | else : 34 | 
expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) 35 | assert(torch.equal(actual_output_tensor, expected_output_tensor)) 36 | Utils.destroy_model_parallel() 37 | 38 | def test_vocab(): 39 | global_vocab_size = 1600 40 | per_partition_vocab_size = 1600 / Utils.world_size 41 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) 42 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) 43 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import megatron.core.parallel_state as ps 4 | 5 | class Utils: 6 | 7 | world_size = torch.cuda.device_count() 8 | rank = int(os.environ['LOCAL_RANK']) 9 | 10 | @staticmethod 11 | def initialize_distributed(): 12 | print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') 13 | torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) 14 | init_method = 'tcp://' 15 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 16 | master_port = os.getenv('MASTER_PORT', '6000') 17 | init_method += master_ip + ':' + master_port 18 | torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) 19 | 20 | @staticmethod 21 | def destroy_model_parallel(): 22 | ps.destroy_model_parallel() 23 | torch.distributed.barrier() 24 | 25 | @staticmethod 26 | def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): 27 | ps.destroy_model_parallel() 28 | if not torch.distributed.is_initialized(): 29 | Utils.initialize_distributed() 30 | ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import megatron.core.utils as util 4 | import numpy as np 5 | 6 | def test_divide_properly(): 7 | assert util.divide(4,2) == 2 8 | 9 | def test_divide_improperly(): 10 | with pytest.raises(AssertionError): 11 | util.divide(4,5) 12 | 13 | def test_global_memory_buffer(): 14 | global_memory_buffer = util.GlobalMemoryBuffer() 15 | obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") 16 | expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) 17 | assert torch.equal(obtained_tensor, expected_tensor) 18 | 19 | def test_make_viewless_tensor(): 20 | inp = torch.rand((3,4)) 21 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) 22 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) 23 | 24 | def 
test_safely_set_viewless_tensor_data(): 25 | tensor = torch.zeros((3,4)) 26 | new_data_tensor = torch.tensor(np.random.rand(3,4)) 27 | util.safely_set_viewless_tensor_data(tensor, new_data_tensor) 28 | assert(torch.equal(tensor, new_data_tensor)) 29 | 30 | def test_assert_viewless_tensor(): 31 | tensor = torch.rand((3,4)) 32 | assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) 33 | input_tensor_list=[tensor,tensor,tensor] 34 | output_tensor_list = util.assert_viewless_tensor(input_tensor_list) 35 | for inp,out in zip(input_tensor_list, output_tensor_list): 36 | assert(torch.equal(inp,out)) 37 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command.split()) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 6 | os.path.pardir))) 7 | 8 | from megatron.data import indexed_dataset 9 | 10 | 11 | def main(args): 12 | 13 | prefixes = set() 14 | for basename in os.listdir(args.input): 15 | prefix, ext = os.path.splitext(basename) 16 | 17 | if prefix in prefixes: 18 | continue 19 | 20 | if not os.path.isfile(os.path.join(args.input, basename)): 21 | continue 22 | 23 | ext_pair = '.bin' if ext == '.idx' else '.idx' 24 | assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ 25 | f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' 26 | 27 | prefixes.add(prefix) 28 | 29 | builder = None 30 | for prefix in sorted(prefixes): 31 | if builder is None: 32 | dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') 33 | 34 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset): 35 | builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) 36 | else: 37 | builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') 38 | 39 | del dataset 40 | 41 | builder.merge_file_(os.path.join(args.input, prefix)) 42 | 43 |
builder.finalize(args.output_prefix + '.idx') 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser() 48 | 49 | group = parser.add_argument_group(title='input data') 50 | group.add_argument('--input', type=str, required=True, 51 | help='Path to directory containing all document files to merge') 52 | 53 | group = parser.add_argument_group(title='output data') 54 | group.add_argument('--output-prefix', type=str, required=True, 55 | help='Path to binary output file without suffix') 56 | 57 | args = parser.parse_args() 58 | 59 | assert os.path.isdir(args.input), \ 60 | f'ERROR: {args.input} is not a directory or does not exist' 61 | 62 | assert os.path.isdir(os.path.dirname(args.output_prefix)), \ 63 | f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' 64 | 65 | main(args) 66 | 67 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare a training dataset to train the model. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py <path to the downloaded deduplicated URLs> <output file for clean urls, e.g. clean_urls.txt> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with one json object per line of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT training: 24 | 25 | 1. Perform ftfy, English language detection, and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data file> 28 | ``` 29 | Additional cleanup (e.g. removing documents with fewer than 512 characters, or dataset-specific cleaning for datasets like stories and realnews) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <cleaned data file(s)> --output <output file for possible duplicate urls> 33 | ``` 34 | 3. Based on the similarity measure defined inside the function `is_similar` (default threshold: 0.9), group urls that are similar. For each group, we keep only one url and remove the rest (an illustrative union-find sketch of this grouping is included further below). 35 | ``` 36 | python group_duplicate_url.py <possible duplicate urls file> <output file of url groups> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <url groups file> <cleaned data file> <deduplicated output file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <deduplicated data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <task names, e.g. lambada squad> --dedup-dataset <training dataset file> --output <deduplicated output file> 54 | ``` 55 | We use 13-grams by default for the deduplication.
When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, and any document that was split more than 10 times. These parameters can be changed using the corresponding arguments; an illustrative sketch of this splitting rule is included further below. 56 | 57 | Only for the lambada task, we need to provide the path to the lambada test set via `--lambada-path <path to lambada test set>`. 58 | 59 | Several other features (e.g. saving and loading the dictionary) have been added; see `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
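To make the 13-gram filtering rule described in the openwebtext README above concrete, here is a minimal sketch of the splitting step: when a matching n-gram is found in a training document, the document is cut around the match, a margin of characters on each side is dropped, and pieces that end up too short are discarded. The helper names and bookkeeping are illustrative assumptions; the real logic lives in `filter_ngrams.py`.

```python
# Illustrative sketch only -- not the actual filter_ngrams.py implementation.
def remove_matched_span(text, span_start, span_end, margin=200):
    """Split `text` around a matched n-gram, dropping the n-gram itself plus
    `margin` characters on both sides, and return the two remaining pieces."""
    left = text[:max(span_start - margin, 0)]
    right = text[span_end + margin:]
    return left, right

def keep_pieces(pieces, num_splits, min_chars=200, max_splits=10):
    """Discard pieces that became too short; drop the whole document if it was
    split too many times."""
    if num_splits > max_splits:
        return []
    return [piece for piece in pieces if len(piece) >= min_chars]
```

For example, a single match in the middle of a 2,000-character document yields two pieces, each missing the matched 13-gram and 200 characters on its side of the match.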
2 | 3 | import json 4 | import time 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | 11 | print('grouping duplicate urls ...') 12 | 13 | input = sys.argv[1] 14 | output = sys.argv[2] 15 | if len(sys.argv) > 3: 16 | jaccard_similarity_threshold = float(sys.argv[3]) 17 | else: 18 | jaccard_similarity_threshold = 0.7 19 | 20 | url_to_index = {} 21 | index_to_urls = [] 22 | counter = 0 23 | start_time = time.time() 24 | with open(input, 'r') as f: 25 | for line in f: 26 | counter += 1 27 | myjson = json.loads(line) 28 | urls = [] 29 | for main_url in myjson.keys(): 30 | urls.append(main_url) 31 | for value in myjson[main_url]: 32 | for other_url, js in value.items(): 33 | if js >= jaccard_similarity_threshold: 34 | urls.append(other_url) 35 | current_index = -1 36 | other_indices = set() 37 | for url in urls: 38 | if url in url_to_index: 39 | if current_index == -1: 40 | current_index = url_to_index[url] 41 | elif current_index != url_to_index[url]: 42 | other_indices.add(url_to_index[url]) 43 | if current_index == -1: 44 | current_index = len(index_to_urls) 45 | index_to_urls.append(set()) 46 | for url in urls: 47 | url_to_index[url] = current_index 48 | index_to_urls[current_index].add(url) 49 | for index in other_indices: 50 | for url in index_to_urls[index]: 51 | index_to_urls[current_index].add(url) 52 | url_to_index[url] = current_index 53 | index_to_urls[index] = None 54 | 55 | if counter % 100000 == 0: 56 | print(' > processed {} lines in {} seconds ...'.format( 57 | counter, time.time() - start_time)) 58 | 59 | 60 | total_remove = 0 61 | total_remain = 0 62 | for urls in index_to_urls: 63 | if urls is not None: 64 | if len(urls) > 1: 65 | total_remove += (len(urls) - 1) 66 | total_remain += 1 67 | print('out of {} urls, only {} are unique and {} should be removed'.format( 68 | total_remove+total_remain, total_remain, total_remove)) 69 | 70 | with open(output, 'wb') as f: 71 | for i, urls in enumerate(index_to_urls): 72 | if urls is not None: 73 | if len(urls) > 1: 74 | myjson = json.dumps({str(i): list(urls)}, 75 | ensure_ascii=False) 76 | f.write(myjson.encode('utf-8')) 77 | f.write('\n'.encode('utf-8')) 78 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
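The grouping performed by `group_duplicate_url.py` above is, conceptually, a union of URL sets: any two urls whose Jaccard similarity meets the threshold end up in the same group, and only one url per group is kept downstream. A compact way to express the same idea is a union-find over the similar pairs; the sketch below is illustrative only and is not the repository's implementation.

```python
# Illustrative sketch: group urls connected by above-threshold similarity.
def group_similar_urls(similar_pairs):
    """similar_pairs: iterable of (url_a, url_b) whose similarity passed the
    threshold. Returns a list of url groups (sets)."""
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(a, b):
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_a] = root_b

    for a, b in similar_pairs:
        union(a, b)

    groups = {}
    for url in list(parent):
        groups.setdefault(find(url), set()).add(url)
    return list(groups.values())
```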
2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/run_text_generation_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Sample Generate GPT""" 4 | import os 5 | import sys 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 7 | os.path.pardir))) 8 | import socket 9 | from megatron import get_args 10 | from megatron import print_rank_0 11 | from megatron.core import mpu 12 | from megatron.checkpointing import load_checkpoint 13 | from megatron.initialize import initialize_megatron 14 | from megatron.model import GPTModel 15 | from megatron.training import get_model 16 | from megatron.text_generation_server import MegatronServer 17 | from megatron.text_generation import generate_and_post_process 18 | from megatron.text_generation import beam_search_and_post_process 19 | import torch 20 | 21 | def model_provider(pre_process=True, post_process=True): 22 | """Build the model.""" 23 | 24 | print_rank_0('building GPT model ...') 25 | model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process) 26 | 27 | return model 28 | 29 | def add_text_generate_args(parser): 30 | group = parser.add_argument_group(title='text generation') 31 | 32 | group.add_argument("--temperature", type=float, default=1.0, 33 | help='Sampling temperature.') 34 | group.add_argument("--top_p", type=float, default=0.0, 35 | help='Top p sampling.') 36 | group.add_argument("--top_k", type=int, default=0, 37 | help='Top k sampling.') 38 | group.add_argument("--out-seq-length", type=int, default=1024, 39 | help='Size of the output generated text.') 40 | return parser 41 | 42 | 43 | if __name__ == "__main__": 44 | initialize_megatron(extra_args_provider=add_text_generate_args, 45 | args_defaults={'tokenizer_type': 'GPT2BPETokenizer', 46 | 'no_load_rng': True, 47 | 'no_load_optim': True}) 48 | 49 | args = get_args() 50 | if args.num_layers_per_virtual_pipeline_stage is not None: 51 | print("Interleaved pipeline schedule is not yet supported for text generation.") 52 | exit() 53 | # Set up model and load checkpoint 54 | model = get_model(model_provider, wrap_with_ddp=False) 55 | 56 | if args.load is not None: 57 | _ = load_checkpoint(model, None, None) 58 | 59 | assert len(model) == 1, "Above condition should have caught this" 60 | model = model[0] 61 | if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: 62 | server = MegatronServer(model) 63 | server.run("0.0.0.0") 64 | 65 | while True: 66 | choice = torch.cuda.LongTensor(1) 67 | torch.distributed.broadcast(choice, 0) 68 | if choice[0].item() == 0: 69 | try: 70 | generate_and_post_process(model) 71 | except ValueError as ve: 72 | pass 73 | elif choice[0].item() == 1: 74 | try: 75 | beam_search_and_post_process(model) 76 | except ValueError as ve: 77 | pass 78 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | import json 3 | import sys 4 | import urllib.request 5 | class PutRequest(urllib.request.Request): 6 | '''class to handle PUT requests with urllib''' 7 | 8 | def get_method(self, *args, **kwargs): 9 | return 'PUT' 10 | 11 | if __name__ == "__main__": 12 | url = sys.argv[1] 13 | while True: 14 | sentence = input("Enter prompt: ") 15 | tokens_to_generate = int(input("Enter number of tokens to generate: ")) 16 | data = json.dumps({"prompts": [sentence], "tokens_to_generate": tokens_to_generate}) 17 | req = PutRequest(url, data.encode('utf-8'), {'Content-Type': 'application/json'}) 18 | response = urllib.request.urlopen(req) 19 | resp_sentences = json.load(response) 20 | print("Megatron Response: ") 21 | print(resp_sentences["text"][0]) 22 | --------------------------------------------------------------------------------
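For completeness, the same PUT request that `text_generation_cli.py` issues interactively can also be scripted. The snippet below is a minimal sketch using the third-party `requests` package; the URL (host, port, and endpoint path) is an assumption and should be whatever your running server actually exposes, exactly as with the CLI's first command-line argument.

```python
# Minimal scripted client for the Megatron text generation server.
# Assumes `pip install requests`; the URL passed in below is a placeholder.
import requests

def generate(url, prompt, tokens_to_generate=64):
    payload = {"prompts": [prompt], "tokens_to_generate": tokens_to_generate}
    # The server expects a PUT with a JSON body, mirroring text_generation_cli.py.
    response = requests.put(url, json=payload)
    response.raise_for_status()
    return response.json()["text"][0]

if __name__ == "__main__":
    print(generate("http://localhost:5000/api", "Megatron-LM is"))
```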