├── .coveragerc
├── .gitignore
├── .gitlab-ci.yml
├── LICENSE
├── README.md
├── docs
    ├── distrib_optimizer.md
    └── images
    │   └── distrib_optimizer
    │       ├── data_flow.png
    │       └── sharding_scheme.png
├── examples
    ├── detxoify_lm
    │   ├── README.md
    │   ├── annotations
    │   │   ├── filter-selfgeneration.py
    │   │   ├── perspective_api_annotate.py
    │   │   └── preprocess.sh
    │   ├── finetune_gpt.py
    │   ├── finetune_gpt_distributed-1.3b.sh
    │   ├── generate-1.3b.sh
    │   ├── generate_samples_gpt.py
    │   ├── perspective_api.py
    │   └── self_generation
    │   │   └── selfgenerate-1.3b-unconditional.sh
    ├── evaluate_retriever_nq.sh
    ├── evaluate_zeroshot_gpt.sh
    ├── finetune_mnli_distributed.sh
    ├── finetune_race_distributed.sh
    ├── finetune_retriever_distributed.sh
    ├── merge_mp_bert.sh
    ├── msdp
    │   ├── README.md
    │   ├── data_processing.sh
    │   ├── eval_knwl_generation.sh
    │   ├── eval_resp_generation.sh
    │   ├── prep_resp_gen.sh
    │   ├── prompt_knwl_gen.sh
    │   └── prompt_resp_gen.sh
    ├── pretrain_bert.sh
    ├── pretrain_bert_distributed.sh
    ├── pretrain_bert_distributed_with_mp.sh
    ├── pretrain_gpt.sh
    ├── pretrain_gpt3_175B.sh
    ├── pretrain_gpt_distributed.sh
    ├── pretrain_gpt_distributed_with_mp.sh
    ├── pretrain_ict.sh
    ├── pretrain_t5.sh
    ├── pretrain_t5_distributed.sh
    ├── pretrain_t5_distributed_with_mp.sh
    ├── run_text_generation_server_345M.sh
    ├── run_text_generation_server_345M_8_tensor_parallel.sh
    └── sc21
    │   ├── CONFIG.sh
    │   ├── README.md
    │   ├── SBATCH.sh
    │   ├── SRUN.sh
    │   ├── run_figure_11.sh
    │   ├── run_figure_12.sh
    │   ├── run_figure_13.sh
    │   ├── run_figure_14.sh
    │   ├── run_figure_15.sh
    │   ├── run_figure_16.sh
    │   ├── run_figure_17.sh
    │   ├── run_figure_18.sh
    │   └── run_table_1.sh
├── images
    ├── Achieved_petaFLOPs.png
    └── cases_april2021.png
├── megatron
    ├── __init__.py
    ├── arguments.py
    ├── checkpointing.py
    ├── core
    │   ├── __init__.py
    │   ├── parallel_state.py
    │   ├── tensor_parallel
    │   │   ├── __init__.py
    │   │   ├── cross_entropy.py
    │   │   ├── data.py
    │   │   ├── layers.py
    │   │   ├── mappings.py
    │   │   ├── random.py
    │   │   └── utils.py
    │   └── utils.py
    ├── data
    │   ├── Makefile
    │   ├── __init__.py
    │   ├── autoaugment.py
    │   ├── bert_dataset.py
    │   ├── biencoder_dataset_utils.py
    │   ├── blendable_dataset.py
    │   ├── data_samplers.py
    │   ├── dataset_utils.py
    │   ├── gpt_dataset.py
    │   ├── helpers.cpp
    │   ├── ict_dataset.py
    │   ├── image_folder.py
    │   ├── indexed_dataset.py
    │   ├── orqa_wiki_dataset.py
    │   ├── realm_dataset_utils.py
    │   ├── realm_index.py
    │   ├── t5_dataset.py
    │   ├── test
    │   │   ├── test_indexed_dataset.py
    │   │   └── test_preprocess_data.sh
    │   └── vit_dataset.py
    ├── dist_signal_handler.py
    ├── fp16_deprecated
    │   └── loss_scaler.py
    ├── fused_kernels
    │   ├── __init__.py
    │   ├── compat.h
    │   ├── fused_weight_gradient_dense.cpp
    │   ├── fused_weight_gradient_dense.cu
    │   ├── layer_norm_cuda.cpp
    │   ├── layer_norm_cuda_kernel.cu
    │   ├── scaled_masked_softmax.cpp
    │   ├── scaled_masked_softmax.h
    │   ├── scaled_masked_softmax_cuda.cu
    │   ├── scaled_softmax.cpp
    │   ├── scaled_softmax_cuda.cu
    │   ├── scaled_upper_triang_masked_softmax.cpp
    │   ├── scaled_upper_triang_masked_softmax.h
    │   ├── scaled_upper_triang_masked_softmax_cuda.cu
    │   ├── tests
    │   │   ├── __init__.py
    │   │   └── test_fused_kernels.py
    │   └── type_shim.h
    ├── global_vars.py
    ├── indexer.py
    ├── initialize.py
    ├── memory.py
    ├── microbatches.py
    ├── model
    │   ├── __init__.py
    │   ├── bert_model.py
    │   ├── biencoder_model.py
    │   ├── classification.py
    │   ├── distributed.py
    │   ├── enums.py
    │   ├── fused_bias_gelu.py
    │   ├── fused_layer_norm.py
    │   ├── fused_softmax.py
    │   ├── gpt_model.py
    │   ├── language_model.py
    │   ├── megablocks_utils.py
    │   ├── module.py
    │   ├── multiple_choice.py
    │   ├── realm_model.py
    │   ├── t5_model.py
    │   ├── transformer.py
    │   ├── utils.py
    │   └── vision
    │   │   ├── classification.py
    │   │   ├── dino.py
    │   │   ├── esvit_swin_backbone.py
    │   │   ├── inpainting.py
    │   │   ├── knn_monitor.py
    │   │   ├── mit_backbone.py
    │   │   ├── swin_backbone.py
    │   │   ├── utils.py
    │   │   └── vit_backbone.py
    ├── mpu
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── commons.py
    │   │   ├── test_cross_entropy.py
    │   │   ├── test_data.py
    │   │   ├── test_initialize.py
    │   │   ├── test_layers.py
    │   │   └── test_random.py
    ├── optimizer
    │   ├── __init__.py
    │   ├── adafactor.py
    │   ├── clip_grads.py
    │   ├── distrib_optimizer.py
    │   ├── grad_scaler.py
    │   └── optimizer.py
    ├── optimizer_param_scheduler.py
    ├── p2p_communication.py
    ├── schedules.py
    ├── static
    │   └── index.html
    ├── text_generation
    │   ├── __init__.py
    │   ├── api.py
    │   ├── beam_utils.py
    │   ├── communication.py
    │   ├── forward_step.py
    │   ├── generation.py
    │   ├── sampling.py
    │   └── tokenization.py
    ├── text_generation_server.py
    ├── timers.py
    ├── tokenizer
    │   ├── __init__.py
    │   ├── bert_tokenization.py
    │   ├── gpt2_tokenization.py
    │   └── tokenizer.py
    ├── training.py
    └── utils.py
├── pretrain_bert.py
├── pretrain_gpt.py
├── pretrain_ict.py
├── pretrain_t5.py
├── pretrain_vision_classify.py
├── pretrain_vision_dino.py
├── pretrain_vision_inpaint.py
├── setup.py
├── tasks
    ├── data_utils.py
    ├── ensemble_classifier.py
    ├── eval_utils.py
    ├── finetune_utils.py
    ├── glue
    │   ├── data.py
    │   ├── finetune.py
    │   ├── mnli.py
    │   └── qqp.py
    ├── main.py
    ├── msdp
    │   ├── README.md
    │   ├── evaluate.py
    │   ├── main.py
    │   ├── metrics.py
    │   ├── preprocessing.py
    │   └── prompt.py
    ├── orqa
    │   ├── README.md
    │   ├── evaluate_orqa.py
    │   ├── evaluate_utils.py
    │   ├── supervised
    │   │   ├── data.py
    │   │   ├── eval_utils.py
    │   │   └── finetune.py
    │   └── unsupervised
    │   │   ├── nq.py
    │   │   ├── qa_utils.py
    │   │   └── tokenizers.py
    ├── race
    │   ├── data.py
    │   └── finetune.py
    ├── vision
    │   ├── classification
    │   │   ├── classification.py
    │   │   └── eval_utils.py
    │   ├── finetune_utils.py
    │   ├── main.py
    │   └── segmentation
    │   │   ├── cityscapes.py
    │   │   ├── data.py
    │   │   ├── finetune_segformer.py
    │   │   ├── finetune_setr.py
    │   │   ├── metrics.py
    │   │   ├── seg_heads.py
    │   │   ├── seg_models.py
    │   │   ├── transforms.py
    │   │   └── utils.py
    └── zeroshot_gpt
    │   ├── datasets.py
    │   ├── detokenizer.py
    │   └── evaluate.py
├── tests
    ├── __init__.py
    ├── tensor_parallel
    │   ├── test_cross_entropy.py
    │   ├── test_data.py
    │   ├── test_mappings.py
    │   ├── test_random.py
    │   └── test_tensor_parallel_utils.py
    ├── test_basic.py
    ├── test_parallel_state.py
    ├── test_utilities.py
    └── test_utils.py
└── tools
    ├── checkpoint_loader_megatron.py
    ├── checkpoint_saver_megatron.py
    ├── checkpoint_util.py
    ├── linter.py
    ├── merge_datasets.py
    ├── openwebtext
        ├── README.md
        ├── add_id.py
        ├── blacklist_urls.py
        ├── cleanup_dataset.py
        ├── cleanup_fix_dataset.py
        ├── filter_ngrams.py
        ├── find_duplicates.py
        ├── group_duplicate_url.py
        ├── merge_jsons.py
        └── remove_group_duplicates.py
    ├── preprocess_data.py
    ├── preprocess_data_nmt.py
    ├── preprocess_data_partitions.py
    ├── run_text_generation_server.py
    └── text_generation_cli.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [html]
2 | directory = coverage
3 | 
4 | [run]
5 | data_file = .coverage_$LOCAL_RANK
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.so
3 | build
4 | .coverage_*
5 | *.egg-info
6 | *~
7 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel
 2 | 
 3 | test:
 4 |   tags:
 5 |     - docker_gpu_enabled
 6 |   script:
 7 |     - torchrun --nproc_per_node=8  -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/
 8 |   coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
 9 |   artifacts:
10 |     paths:
11 |       - coverage
12 |     expire_in: 30 days
13 |     


--------------------------------------------------------------------------------
/docs/distrib_optimizer.md:
--------------------------------------------------------------------------------
 1 | # Distributed Optimizer
 2 | 
 3 | The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following:
 4 | 
 5 | - [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed)
 6 | - [no] distribute model gradients
 7 | - [no] distribute model parameters
 8 | 
 9 | Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size):
10 | 
11 | |        | Non-distributed optim | Distributed optim |
12 | | ------ | ------ | ------ |
13 | | float16 param, float16 grads | 20 | 4 + 16/d |
14 | | float16 param, fp32 grads    | 18 | 6 + 12/d |
15 | | fp32 param, fp32 grads       | 16 | 8 + 8/d  |
16 | 
17 | The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. The grad buffer at any given moment either holds:
18 | 
19 | 1. all model grads
20 | 2. a 1/d size _copy_ of the main grads (before copying to the optimizer state)
21 | 3. a 1/d size _copy_ of the main params (after copying from the optimizer state)
22 | 4. all model params
23 | 5. zeros (or None), between iterations
24 | 
25 | The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated.
26 | 
27 | The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update:
28 | 
29 | ## Data flow
30 | 
31 | ![Data flow](images/distrib_optimizer/data_flow.png)
32 | 
33 | ## Sharding scheme
34 | 
35 | ![Sharding scheme](images/distrib_optimizer/sharding_scheme.png)
36 | 
37 | ## Key steps
38 | 
39 | _(note: using illustrations above, and assuming fp16 grads)_
40 | 
41 | - Backward pass finishes (grad buffer holds 16 fp16 grad elements)
42 | - Call reduce-scatter on each DP rank
43 | - Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage)
44 | - Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e.
45 |   - DP rank 0 copies elements [0:4]
46 |   - DP rank 1 copies elements [4:8]
47 |   - DP rank 2 copies elements [8:12]
48 |   - DP rank 3 copies elements [12:16]
49 | - Optimizer.step()
50 | - Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer
51 | - Call all-gather on each DP rank
52 | - Grad buffer now contains all 16, fully updated, fp16 model param elements
53 | - Copy updated model params from grad buffer into their respective param tensors
54 | - (At this point, grad buffer is ready to be zero'd for the next iteration)
55 | 


--------------------------------------------------------------------------------
/docs/images/distrib_optimizer/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/docs/images/distrib_optimizer/data_flow.png


--------------------------------------------------------------------------------
/docs/images/distrib_optimizer/sharding_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/docs/images/distrib_optimizer/sharding_scheme.png


--------------------------------------------------------------------------------
/examples/detxoify_lm/annotations/filter-selfgeneration.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | from typing import Dict, Optional, List
 4 | 
 5 | import joblib
 6 | from googleapiclient import discovery
 7 | from googleapiclient.errors import HttpError
 8 | 
 9 | import argparse
10 | 
11 | from tqdm import tqdm
12 | 
13 | parser = argparse.ArgumentParser(description='Process some integers.')
14 | parser.add_argument('--data-path', type=str, default='',
15 |                     help='data path to load the jsonl')
16 | parser.add_argument('--out-path', type=str, default='',
17 |                     help='data path to load the jsonl')
18 | parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl',
19 |                     help='data path to load the prompt jsonl')
20 | parser.add_argument('--workers', type=int, default=10,
21 |                    help='Number of worker processes to launch')
22 | 
23 | 
24 | def get_corpus_scores(lines):
25 |     scores = []
26 |     corpus = []
27 | 
28 |     for line in tqdm(lines):
29 |         data = json.loads(line)
30 |         score = data['score']
31 |         if score is not None:
32 |             scores.append(score['toxicity'])
33 |             corpus.append(data)
34 |     return scores, corpus
35 | 
36 | 
37 | def main():
38 |     args = parser.parse_args()
39 | 
40 |     with open(args.data_path) as f:
41 |         lines = f.readlines()
42 | 
43 |     print(f"total line number: {len(lines)}")
44 | 
45 |     scores, corpus = get_corpus_scores(lines)
46 |     import numpy as np
47 |     scores = np.array(scores)
48 |     indices = np.argsort(scores)
49 | 
50 |     print(f"total valid samples: {len(scores)}")
51 | 
52 |     print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}")
53 |     print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}")
54 |     print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}")
55 |     print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}")
56 |     print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}")
57 | 
58 |     samples_left = len(lines) // 2
59 |     print(f"After filtering: {samples_left} of samples are left")
60 |     nontoxic_indices = indices[:samples_left]
61 |     print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}")
62 |     print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}")
63 |     nontoxic_data = [corpus[ind] for ind in nontoxic_indices]
64 |     print(f"Total samples after filtering: {len(nontoxic_data)}")
65 |     print(f"Examples: {nontoxic_data[:3]}")
66 | 
67 |     from sklearn.utils import shuffle
68 |     nontoxic_data = shuffle(nontoxic_data)
69 | 
70 |     with open(args.out_path, 'w') as f:
71 |         for x in nontoxic_data:
72 |             f.write(json.dumps(x) + '\n')
73 | 
74 | 
75 | main()


--------------------------------------------------------------------------------
/examples/detxoify_lm/annotations/preprocess.sh:
--------------------------------------------------------------------------------
 1 | VOCAB_FILE=pt2-vocab.json
 2 | MERGE_FILE=gpt2-merges.txt
 3 | 
 4 | python3 tools/preprocess_data.py \
 5 |     --input $1 \
 6 |     --output-prefix $2 \
 7 |     --vocab-file $VOCAB_FILE \
 8 |     --merge-file $MERGE_FILE \
 9 |     --tokenizer-type GPT2BPETokenizer \
10 |     --append-eod  --workers 20 --chunk-size 25
11 | 
12 | 
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Change for multinode config
 4 | GPUS_PER_NODE=16
 5 | MASTER_ADDR=localhost
 6 | MASTER_PORT=$(($RANDOM + 1024))
 7 | NNODES=1
 8 | NODE_RANK=0
 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
10 | 
11 | # input
12 | DATA_PATH=$1
13 | SHARE_DATA=$PWD                       # current work dir
14 | FINETUNED_PATH="$SHARE_DATA/$2"
15 | lr=$3
16 | bs=$4
17 | iter=$5
18 | CHECKPOINT_PATH=$6
19 | 
20 | # vocab
21 | VOCAB_FILE=gpt2-vocab.json           # Your gpt-2 vocab
22 | MERGE_FILE=gpt2-merges.txt           # Your gpt-2 merge file
23 | 
24 | # tensorboard
25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2"
26 | mkdir -p ${TENSORBOARD_DIR}
27 | 
28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
29 | 
30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \
31 |      examples/detxoify_lm/finetune_gpt.py \
32 |      --num-layers 24 \
33 |      --hidden-size 2048 \
34 |      --num-attention-heads 32 \
35 |      --micro-batch-size 4 \
36 |      --global-batch-size $bs \
37 |      --seq-length 2048 \
38 |      --max-position-embeddings 2048 \
39 |      --train-iters $iter \
40 |      --save $FINETUNED_PATH \
41 |      --load $CHECKPOINT_PATH \
42 |      --data-path $DATA_PATH \
43 |      --data-path2 ${DATA_BLEND} \
44 |      --vocab-file $VOCAB_FILE \
45 |      --merge-file $MERGE_FILE \
46 |      --data-impl mmap \
47 |      --split 100,0,0 \
48 |      --distributed-backend nccl \
49 |      --lr-decay-style constant \
50 |      --lr $lr \
51 |      --clip-grad 1.0 \
52 |      --weight-decay 0.1 \
53 |      --adam-beta1 0.9 \
54 |      --adam-beta2 0.95 \
55 |      --checkpoint-activations \
56 |      --log-interval 1 \
57 |      --save-interval 78 \
58 |      --eval-interval 78 \
59 |      --eval-iters 50 \
60 |      --fp16 \
61 |      --DDP-impl local \
62 |      --finetune --no-load-optim \
63 |      --log-validation-ppl-to-tensorboard \
64 |      --tensorboard-dir ${TENSORBOARD_DIR}
65 | 


--------------------------------------------------------------------------------
/examples/detxoify_lm/generate-1.3b.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | CHECKPOINT_PATH=$2          # Your model ckpt
 3 | VOCAB_FILE=gpt2-vocab.json
 4 | MERGE_FILE=gpt2-merges.txt
 5 | 
 6 | GPUS_PER_NODE=1
 7 | # Change for multinode config
 8 | MASTER_ADDR=localhost
 9 | MASTER_PORT=$(($RANDOM + 1024))
10 | NNODES=1
11 | NODE_RANK=0
12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
13 | NUM_SAMPLES=$(wc -l < $1)
14 | PREFIX=$(basename $2)
15 | SEED=$(($RANDOM))
16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl
17 | 
18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
19 | 
20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
21 |        --tensor-model-parallel-size 1 \
22 |        --num-layers 24 \
23 |        --hidden-size 2048 \
24 |        --load $CHECKPOINT_PATH \
25 |        --num-attention-heads 32 \
26 |        --max-position-embeddings 2048 \
27 |        --tokenizer-type GPT2BPETokenizer \
28 |        --fp16 \
29 |        --micro-batch-size 400 \
30 |        --seq-length 2048 \
31 |        --out-seq-length 20 \
32 |        --temperature 1.0 \
33 |        --vocab-file $VOCAB_FILE \
34 |        --merge-file $MERGE_FILE \
35 |        --sample-input-file $1 \
36 |        --sample-output-file $OUTPUT \
37 |        --num-samples $NUM_SAMPLES \
38 |        --max-tokens-to-oom 1200000 \
39 |        --top_p 0.9 \
40 |        --seed $SEED
41 | 
42 | 


--------------------------------------------------------------------------------
/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | CHECKPOINT_PATH=$2          # Your model ckpt
 3 | SHARE_DATA=$PWD             # current work dir
 4 | VOCAB_FILE=gpt2-vocab.json  # Your gpt-2 vocab
 5 | MERGE_FILE=gpt2-merges.txt  # Your gpt-2 merge file
 6 | 
 7 | GPUS_PER_NODE=1
 8 | # Change for multinode config
 9 | MASTER_ADDR=localhost
10 | MASTER_PORT=$(($RANDOM + 1024))
11 | NNODES=1
12 | NODE_RANK=0
13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
14 | SEED=$3
15 | SUFFIX=$(basename $CHECKPOINT_PATH)
16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/
17 | mkdir -p $save_dir
18 | echo $save_dir/$SEED.out
19 | 
20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
21 | 
22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
23 |        --tensor-model-parallel-size 1 \
24 |        --num-layers 24 \
25 |        --hidden-size 2048 \
26 |        --load $CHECKPOINT_PATH \
27 |        --num-attention-heads 32 \
28 |        --max-position-embeddings 2048 \
29 |        --tokenizer-type GPT2BPETokenizer \
30 |        --fp16 \
31 |        --micro-batch-size 150 \
32 |        --seq-length 2048 \
33 |        --out-seq-length 1000 \
34 |        --temperature 1.0 \
35 |        --vocab-file $VOCAB_FILE \
36 |        --merge-file $MERGE_FILE \
37 |        --num-samples $1 \
38 |        --top_p 0.9 \
39 |        --max-tokens-to-oom 1200000 \
40 |        --genfile $save_dir/$SEED.out  \
41 |        --seed $SEED
42 | 
43 | 


--------------------------------------------------------------------------------
/examples/evaluate_retriever_nq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained
 4 | # ICT model or a finetuned model for Natural Question task
 5 | 
 6 | # Datasets can be downloaded from the following link:
 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
 8 | 
 9 | EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
10 | EMBEDDING_PATH=<Specify path of the embeddings>
11 | CHECKPOINT_PATH=<Specify path of pretrained ICT model or finetuned model>
12 | 
13 | QA_FILE=<Path of the natural question dev or test dataset>
14 | 
15 | python tasks/main.py \
16 |     --task RETRIEVER-EVAL \
17 |     --tokenizer-type BertWordPieceLowerCase \
18 |     --num-layers 12 \
19 |     --hidden-size 768 \
20 |     --num-attention-heads 12 \
21 |     --tensor-model-parallel-size 1 \
22 |     --micro-batch-size 128 \
23 |     --activations-checkpoint-method uniform \
24 |     --seq-length 512 \
25 |     --max-position-embeddings 512 \
26 |     --load ${CHECKPOINT_PATH} \
27 |     --evidence-data-path ${EVIDENCE_DATA_DIR} \
28 |     --embedding-path ${EMBEDDING_PATH} \
29 |     --retriever-seq-length 256 \
30 |     --vocab-file  bert-vocab.txt\
31 |     --qa-data-test ${QA_FILE} \
32 |     --faiss-use-gpu \
33 |     --retriever-report-topk-accuracies 1 5 20 100 \
34 |     --fp16 \
35 |     --indexer-log-interval 1000 \
36 |     --indexer-batch-size 128
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/examples/evaluate_zeroshot_gpt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | WORLD_SIZE=8
 4 | 
 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
 6 |                   --nnodes 1 \
 7 |                   --node_rank 0 \
 8 |                   --master_addr localhost \
 9 |                   --master_port 6000"
10 | 
11 | TASK="LAMBADA"
12 | 
13 | VALID_DATA=<lambada path>
14 | VOCAB_FILE=gpt2-vocab.json
15 | MERGE_FILE=gpt2-merges.txt
16 | CHECKPOINT=checkpoints/gpt2_345m
17 | 
18 | 
19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
20 |                --task $TASK \
21 |                --valid-data $VALID_DATA \
22 |                --tokenizer-type GPT2BPETokenizer \
23 |                --strict-lambada \
24 |                --vocab-file $VOCAB_FILE \
25 |                --merge-file $MERGE_FILE \
26 |                --load $CHECKPOINT \
27 |                --tensor-model-parallel-size 1 \
28 |                --num-layers 24 \
29 |                --hidden-size 1024 \
30 |                --num-attention-heads 16 \
31 |                --batch-size 8 \
32 |                --activations-checkpoint-method uniform \
33 |                --seq-length 1024 \
34 |                --max-position-embeddings 1024 \
35 |                --log-interval 10 \
36 |                --fp16 \
37 |                --no-load-optim \
38 |                --no-load-rng
39 | 


--------------------------------------------------------------------------------
/examples/finetune_mnli_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | WORLD_SIZE=8
 4 | 
 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
 6 |                   --nnodes 1 \
 7 |                   --node_rank 0 \
 8 |                   --master_addr localhost \
 9 |                   --master_port 6000"
10 | 
11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv"
12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
13 |             data/glue_data/MNLI/dev_mismatched.tsv"
14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m
15 | VOCAB_FILE=bert-vocab.txt
16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli
17 | 
18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
19 |                --task MNLI \
20 |                --seed 1234 \
21 |                --train-data $TRAIN_DATA \
22 |                --valid-data $VALID_DATA \
23 |                --tokenizer-type BertWordPieceLowerCase \
24 |                --vocab-file $VOCAB_FILE \
25 |                --epochs 5 \
26 |                --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
27 |                --tensor-model-parallel-size 1 \
28 |                --num-layers 24 \
29 |                --hidden-size 1024 \
30 |                --num-attention-heads 16 \
31 |                --micro-batch-size 8 \
32 |                --activations-checkpoint-method uniform \
33 |                --lr 5.0e-5 \
34 |                --lr-decay-style linear \
35 |                --lr-warmup-fraction 0.065 \
36 |                --seq-length 512 \
37 |                --max-position-embeddings 512 \
38 |                --save-interval 500000 \
39 |                --save $CHECKPOINT_PATH \
40 |                --log-interval 10 \
41 |                --eval-interval 100 \
42 |                --eval-iters 50 \
43 |                --weight-decay 1.0e-1 \
44 |                --fp16
45 | 


--------------------------------------------------------------------------------
/examples/finetune_race_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | WORLD_SIZE=8
 4 | 
 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
 6 |                   --nnodes 1 \
 7 |                   --node_rank 0 \
 8 |                   --master_addr localhost \
 9 |                   --master_port 6000"
10 | 
11 | TRAIN_DATA="data/RACE/train/middle"
12 | VALID_DATA="data/RACE/dev/middle \
13 |             data/RACE/dev/high"
14 | VOCAB_FILE=bert-vocab.txt
15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m
16 | CHECKPOINT_PATH=checkpoints/bert_345m_race
17 | 
18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
19 |                --task RACE \
20 |                --seed 1234 \
21 |                --train-data $TRAIN_DATA \
22 |                --valid-data $VALID_DATA \
23 |                --tokenizer-type BertWordPieceLowerCase \
24 |                --vocab-file $VOCAB_FILE \
25 |                --epochs 3 \
26 |                --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
27 |                --tensor-model-parallel-size 1 \
28 |                --num-layers 24 \
29 |                --hidden-size 1024 \
30 |                --num-attention-heads 16 \
31 |                --micro-batch-size 4 \
32 |                --activations-checkpoint-method uniform \
33 |                --lr 1.0e-5 \
34 |                --lr-decay-style linear \
35 |                --lr-warmup-fraction 0.06 \
36 |                --seq-length 512 \
37 |                --max-position-embeddings 512 \
38 |                --save-interval 100000 \
39 |                --save $CHECKPOINT_PATH \
40 |                --log-interval 10 \
41 |                --eval-interval 100 \
42 |                --eval-iters 50 \
43 |                --weight-decay 1.0e-1 \
44 |                --clip-grad 1.0 \
45 |                --hidden-dropout 0.1 \
46 |                --attention-dropout 0.1 \
47 |                --fp16
48 | 


--------------------------------------------------------------------------------
/examples/finetune_retriever_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Finetune a BERT or pretrained ICT model using Google natural question data 
 4 | # Datasets can be downloaded from the following link:
 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
 6 | 
 7 | WORLD_SIZE=8
 8 | 
 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | CHECKPOINT_PATH=<Specify path for the finetuned retriever model>
16 | 
17 | # Load either of the below
18 | BERT_LOAD_PATH=<Path of BERT pretrained model>
19 | PRETRAINED_CHECKPOINT=<Path of Pretrained ICT model>
20 | 
21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
22 |         --task RET-FINETUNE-NQ \
23 |         --train-with-neg \
24 |         --train-hard-neg 1 \
25 |         --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \
26 |         --num-layers 12 \
27 |         --hidden-size 768 \
28 |         --num-attention-heads 12 \
29 |         --tensor-model-parallel-size 1 \
30 |         --tokenizer-type BertWordPieceLowerCase \
31 |         --train-data nq-train.json \
32 |         --valid-data nq-dev.json \
33 |         --save ${CHECKPOINT_PATH} \
34 |         --load ${CHECKPOINT_PATH} \
35 |         --vocab-file bert-vocab.txt \
36 |         --bert-load ${BERT_LOAD_PATH} \
37 |         --save-interval 5000 \
38 |         --log-interval 10 \
39 |         --eval-interval 20000 \
40 |         --eval-iters 100 \
41 |         --indexer-log-interval 1000 \
42 |         --faiss-use-gpu \
43 |         --DDP-impl torch \
44 |         --fp16 \
45 |         --retriever-report-topk-accuracies 1 5 10 20 100 \
46 |         --seq-length 512 \
47 |         --retriever-seq-length 256 \
48 |         --max-position-embeddings 512 \
49 |         --retriever-score-scaling \
50 |         --epochs 80 \
51 |         --micro-batch-size 8 \
52 |         --eval-micro-batch-size 16 \
53 |         --indexer-batch-size 128 \
54 |         --lr 2e-5 \
55 |         --lr-warmup-fraction 0.01 \
56 |         --weight-decay 1e-1
57 | 


--------------------------------------------------------------------------------
/examples/merge_mp_bert.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | TENSOR_MODEL_PARALLEL_SIZE=2
 4 | 
 5 | VOCAB_FILE=bert-vocab.txt
 6 | CHECKPOINT_PATH=checkpoints/bert_345m
 7 | 
 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
 9 |                                 --model-type BERT \
10 |                                 --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
11 |                                 --tokenizer-type BertWordPieceLowerCase \
12 |                                 --vocab-file $VOCAB_FILE \
13 |                                 --num-layers 24 \
14 |                                 --hidden-size 1024 \
15 |                                 --num-attention-heads 16 \
16 |                                 --seq-length 512 \
17 |                                 --max-position-embeddings 512 \
18 |                                 --load $CHECKPOINT_PATH
19 | 


--------------------------------------------------------------------------------
/examples/msdp/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation
3 | 
4 | This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp).
5 | 
6 | 


--------------------------------------------------------------------------------
/examples/msdp/data_processing.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Data preparation for our framework: preprocessing the WoW and WoI datasets
 4 | # The datasets can be downloaded through the following links:
 5 | # WoW: https://parl.ai/projects/wizard_of_wikipedia/
 6 | # WoI: https://parl.ai/projects/sea/
 7 | 
 8 | DIR=`pwd`
 9 | # Before running the preprocessing, please download 
10 | # the wizard of wikipedia and wizard datasets
11 | WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
12 | WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>
13 | 
14 | # We provide examples for processing the raw data from Wizard of Wikipedia
15 | # Processing the train dataset (train.json)
16 | python ${DIR}/tasks/msdp/preprocessing.py \
17 |         --func process_wow_dataset \
18 |         --raw_file ${WOW_DATA_FOLDER}/train.json \
19 |         --processed_file ${WOW_DATA_FOLDER}/train_processed.txt
20 | 
21 | # Processing test seen dataset (test_random_split.json)
22 | python ${DIR}/tasks/msdp/preprocessing.py \
23 |         --func process_wow_dataset \
24 |         --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
25 |         --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
26 |         --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
27 |         --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt
28 | 
29 | # processing test unseen dataset (test_topic_split.json)
30 | python ${DIR}/tasks/msdp/preprocessing.py \
31 |         --func process_wow_dataset \
32 |         --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
33 |         --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
34 |         --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
35 |         --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt
36 | 
37 | 
38 | # We provide the following script to process the raw data from Wizard of Internet
39 | # Processing the test dataset (test.jsonl)
40 | python ${DIR}/tasks/msdp/preprocessing.py \
41 |         --func process_woi_dataset \
42 |         --raw_file ${WOI_DATA_FOLDER}/test.jsonl \
43 |         --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
44 |         --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
45 |         --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt
46 | 
47 | 
48 | # Get the knowledge generation prompts for the each test dataset in WoW and WoI
49 | MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL> 
50 | # WoW test seen
51 | python ${DIR}/tasks/msdp/preprocessing.py \
52 |         --func get_knwl_gen_prompts \
53 |         --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
54 |         --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
55 |         --model_file ${MODEL_FILE} \
56 |         --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
57 |         --data_type wow_seen
58 | 
59 | # WoW test unseen
60 | python ${DIR}/tasks/msdp/preprocessing.py \
61 |         --func get_knwl_gen_prompts \
62 |         --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
63 |         --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
64 |         --model_file ${MODEL_FILE} \
65 |         --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
66 |         --data_type wow_unseen
67 | 
68 | # WoI
69 | python ${DIR}/tasks/msdp/preprocessing.py \
70 |         --func get_knwl_gen_prompts \
71 |         --test_file ${WOI_DATA_FOLDER}/test_processed.txt \
72 |         --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
73 |         --model_file ${MODEL_FILE} \
74 |         --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
75 |         --data_type woi
76 | 
77 | 
78 | # Get the response generation prompts (can be applied for all the test datasets)
79 | python ${DIR}/tasks/msdp/preprocessing.py \
80 |         --func get_resp_gen_prompts \
81 |         --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
82 |         --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
83 | 
84 | 


--------------------------------------------------------------------------------
/examples/msdp/eval_knwl_generation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #########################
 4 | # Evaluate the F1 scores.
 5 | #########################
 6 | 
 7 | WORLD_SIZE=1
 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
 9 |                   --nnodes 1 \
10 |                   --node_rank 0 \
11 |                   --master_addr localhost \
12 |                   --master_port 6000"
13 |                   
14 | MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION> \ 
15 |         (e.g., /testseen_knowledge_generations.txt)
16 | GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \ 
17 |         (e.g., /testseen_knowledge_reference.txt)
18 | 
19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
20 |         --num-layers 24 \
21 |         --hidden-size 1024 \
22 |         --num-attention-heads 16 \
23 |         --seq-length 2048 \
24 |         --max-position-embeddings 2048 \
25 |         --micro-batch-size 4 \
26 |         --task MSDP-EVAL-F1 \
27 |         --guess-file ${MODEL_GEN_PATH} \
28 |         --answer-file ${GROUND_TRUTH_PATH}
29 | 
30 | 
31 | ############################################
32 | # Evaluate BLEU, METEOR, and ROUGE-L scores.
33 | ############################################
34 | 
35 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 
36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 
37 | 
38 | # To evaluate on these metrics, please setup the environments based on 
39 | # the nlg-eval github, and run the corresponding evaluation commands.
40 | 
41 | nlg-eval \
42 |     --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
43 |     --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
44 | 


--------------------------------------------------------------------------------
/examples/msdp/eval_resp_generation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #########################
 4 | # Evaluate the F1 scores.
 5 | #########################
 6 | 
 7 | WORLD_SIZE=1
 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
 9 |                   --nnodes 1 \
10 |                   --node_rank 0 \
11 |                   --master_addr localhost \
12 |                   --master_port 6000"
13 |                   
14 | MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \ 
15 |         (e.g., /testseen_response_generations.txt)
16 | GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE> \ 
17 |         (e.g., /testseen_response_reference.txt)
18 | 
19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
20 |         --num-layers 24 \
21 |         --hidden-size 1024 \
22 |         --num-attention-heads 16 \
23 |         --seq-length 2048 \
24 |         --max-position-embeddings 2048 \
25 |         --micro-batch-size 4 \
26 |         --task MSDP-EVAL-F1 \
27 |         --guess-file ${MODEL_GEN_PATH} \
28 |         --answer-file ${GROUND_TRUTH_PATH}
29 | 
30 | 
31 | ##########################
32 | # Evaluate the KF1 scores.
33 | ##########################
34 |                   
35 | MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \ 
36 |         (e.g., /testseen_response_generations.txt)
37 | GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \ 
38 |         (e.g., /testseen_knowledge_reference.txt)
39 | 
40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
41 |         --num-layers 24 \
42 |         --hidden-size 1024 \
43 |         --num-attention-heads 16 \
44 |         --seq-length 2048 \
45 |         --max-position-embeddings 2048 \
46 |         --micro-batch-size 4 \
47 |         --task MSDP-EVAL-F1 \
48 |         --guess-file ${MODEL_GEN_PATH} \
49 |         --answer-file ${GROUND_TRUTH_PATH}
50 | 
51 | 
52 | ############################################
53 | # Evaluate BLEU, METEOR, and ROUGE-L scores.
54 | ############################################
55 | 
56 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 
57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 
58 | 
59 | # To evaluate on these metrics, please setup the environments based on 
60 | # the nlg-eval github, and run the corresponding evaluation commands.
61 | 
62 | nlg-eval \
63 |     --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
64 |     --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
65 | 


--------------------------------------------------------------------------------
/examples/msdp/prep_resp_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Preparing the input file for the response generation (second-stage prompting)
 4 | 
 5 | DIR=`pwd`
 6 | 
 7 | TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
 8 |         (e.g., /testseen_processed.txt)
 9 | KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
10 |         (e.g., /testseen_knowledge_generations.txt)
11 | PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
12 |         (e.g., /testseen_processed_with_generated_knowledge.txt)
13 | 
14 | python ${DIR}/tasks/msdp/preprocessing.py \
15 |         --func prepare_input \
16 |         --test_file ${TEST_FILE} \
17 |         --knwl_gen_file ${KNOWLEDGE_FILE} \
18 |         --processed_file ${PROCESSED_FILE}
19 | 


--------------------------------------------------------------------------------
/examples/msdp/prompt_knwl_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge
 5 | # The size of the pretrained language model is 357M
 6 | 
 7 | WORLD_SIZE=8
 8 | 
 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
10 |                   --nnodes 1 \
11 |                   --node_rank 0 \
12 |                   --master_addr localhost \
13 |                   --master_port 6000"
14 | 
15 | CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
16 | VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
17 | MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
18 | INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE> \ 
19 |         (e.g., /testseen_processed.txt)
20 | PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS> \
21 |         (e.g., /testseen_knowledge_prompts.json)
22 | OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
23 |         (e.g., /testseen_knowledge_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 |         --num-layers 24 \
27 |         --hidden-size 1024 \
28 |         --num-attention-heads 16 \
29 |         --seq-length 2048 \
30 |         --max-position-embeddings 2048 \
31 |         --micro-batch-size 1 \
32 |         --vocab-file ${VOCAB_PATH} \
33 |         --merge-file ${MERGE_PATH} \
34 |         --load ${CHECKPOINT_PATH} \
35 |         --fp16 \
36 |         --DDP-impl torch \
37 |         --tokenizer-type GPT2BPETokenizer \
38 |         --sample-input-file ${INPUT_PATH} \
39 |         --sample-output-file ${OUTPUT_PATH} \
40 |         --prompt-file ${PROMPT_PATH} \
41 |         --prompt-type knowledge \
42 |         --num-prompt-examples 10 \
43 |         --task MSDP-PROMPT 
44 | 
45 | # NOTE: If you use api for the model generation, please use 
46 | # the "--api-prompt" flag (setting this value as True). 
47 | 


--------------------------------------------------------------------------------
/examples/msdp/prompt_resp_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response
 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1
 5 | # The output is the corresponding response.
 6 | # The size of the pretrained language model is 357M
 7 | 
 8 | WORLD_SIZE=8
 9 | 
10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
11 |                   --nnodes 1 \
12 |                   --node_rank 0 \
13 |                   --master_addr localhost \
14 |                   --master_port 6000"
15 | 
16 | CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
17 | VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
18 | MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
19 | INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE> (e.g., /testseen_processed.txt)
20 | PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS> \
21 |         (e.g., /response_prompts.txt)
22 | OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
23 |         (e.g., /output_testseen_response_generations.txt)
24 | 
25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
26 |         --num-layers 24 \
27 |         --hidden-size 1024 \
28 |         --num-attention-heads 16 \
29 |         --seq-length 2048 \
30 |         --max-position-embeddings 2048 \
31 |         --micro-batch-size 1 \
32 |         --vocab-file ${VOCAB_PATH} \
33 |         --merge-file ${MERGE_PATH} \
34 |         --load ${CHECKPOINT_PATH} \
35 |         --fp16 \
36 |         --DDP-impl torch \
37 |         --tokenizer-type GPT2BPETokenizer \
38 |         --sample-input-file ${INPUT_PATH} \
39 |         --sample-output-file ${OUTPUT_PATH} \
40 |         --prompt-file ${PROMPT_PATH} \
41 |         --prompt-type response \
42 |         --num-prompt-examples 20 \
43 |         --task MSDP-PROMPT 
44 | 
45 | # NOTE: If you use api for the model generation, please use 
46 | # the "--api-prompt" flag (setting this value as True). 
47 | 


--------------------------------------------------------------------------------
/examples/pretrain_bert.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | RANK=0
 4 | WORLD_SIZE=1
 5 | DATA_PATH=<Specify path and file prefix>_text_sentence
 6 | CHECKPOINT_PATH=<Specify path>
 7 | 
 8 | python pretrain_bert.py \
 9 |        --num-layers 24 \
10 |        --hidden-size 1024 \
11 |        --num-attention-heads 16 \
12 |        --micro-batch-size 4 \
13 |        --global-batch-size 8 \
14 |        --seq-length 512 \
15 |        --max-position-embeddings 512 \
16 |        --train-iters 2000000 \
17 |        --lr-decay-iters 990000 \
18 |        --save $CHECKPOINT_PATH \
19 |        --load $CHECKPOINT_PATH \
20 |        --data-path $DATA_PATH \
21 |        --vocab-file bert-vocab.txt \
22 |        --data-impl mmap \
23 |        --split 949,50,1 \
24 |        --lr 0.0001 \
25 |        --min-lr 0.00001 \
26 |        --lr-decay-style linear \
27 |        --lr-warmup-fraction .01 \
28 |        --weight-decay 1e-2 \
29 |        --clip-grad 1.0 \
30 |        --log-interval 100 \
31 |        --save-interval 10000 \
32 |        --eval-interval 1000 \
33 |        --eval-iters 10 \
34 |        --fp16
35 | 


--------------------------------------------------------------------------------
/examples/pretrain_bert_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | GPUS_PER_NODE=8
 4 | # Change for multinode config
 5 | MASTER_ADDR=localhost
 6 | MASTER_PORT=6000
 7 | NNODES=1
 8 | NODE_RANK=0
 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
10 | 
11 | DATA_PATH=<Specify path and file prefix>_text_sentence
12 | CHECKPOINT_PATH=<Specify path>
13 | 
14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
15 | 
16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
17 |        pretrain_bert.py \
18 |        --num-layers 24 \
19 |        --hidden-size 1024 \
20 |        --num-attention-heads 16 \
21 |        --micro-batch-size 4 \
22 |        --global-batch-size 32 \
23 |        --seq-length 512 \
24 |        --max-position-embeddings 512 \
25 |        --train-iters 1000000 \
26 |        --save $CHECKPOINT_PATH \
27 |        --load $CHECKPOINT_PATH \
28 |        --data-path $DATA_PATH \
29 |        --vocab-file bert-vocab.txt \
30 |        --data-impl mmap \
31 |        --split 949,50,1 \
32 |        --distributed-backend nccl \
33 |        --lr 0.0001 \
34 |        --lr-decay-style linear \
35 |        --min-lr 1.0e-5 \
36 |        --lr-decay-iters 990000 \
37 |        --weight-decay 1e-2 \
38 |        --clip-grad 1.0 \
39 |        --lr-warmup-fraction .01 \
40 |        --log-interval 100 \
41 |        --save-interval 10000 \
42 |        --eval-interval 1000 \
43 |        --eval-iters 10 \
44 |        --fp16
45 | 


--------------------------------------------------------------------------------
/examples/pretrain_bert_distributed_with_mp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | GPUS_PER_NODE=8
 4 | # Change for multinode config
 5 | MASTER_ADDR=localhost
 6 | MASTER_PORT=6000
 7 | NNODES=1
 8 | NODE_RANK=0
 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
10 | 
11 | DATA_PATH=<Specify path and file prefix>_text_sentence
12 | VOCAB_FILE=<Specify path to vocab.txt>
13 | CHECKPOINT_PATH=<Specify path>
14 | 
15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
16 | 
17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
18 |        pretrain_bert.py \
19 |        --tensor-model-parallel-size 2 \
20 |        --pipeline-model-parallel-size 2 \
21 |        --num-layers 24 \
22 |        --hidden-size 1024 \
23 |        --num-attention-heads 16 \
24 |        --micro-batch-size 2 \
25 |        --global-batch-size 16 \
26 |        --seq-length 512 \
27 |        --max-position-embeddings 512 \
28 |        --train-iters 1000000 \
29 |        --save $CHECKPOINT_PATH \
30 |        --load $CHECKPOINT_PATH \
31 |        --data-path $DATA_PATH \
32 |        --vocab-file $VOCAB_FILE \
33 |        --data-impl mmap \
34 |        --split 949,50,1 \
35 |        --distributed-backend nccl \
36 |        --lr 0.0001 \
37 |        --lr-decay-style linear \
38 |        --min-lr 1.0e-5 \
39 |        --lr-decay-iters 990000 \
40 |        --weight-decay 1e-2 \
41 |        --clip-grad 1.0 \
42 |        --lr-warmup-fraction .01 \
43 |        --log-interval 100 \
44 |        --save-interval 10000 \
45 |        --eval-interval 1000 \
46 |        --eval-iters 10 \
47 |        --fp16
48 | 


--------------------------------------------------------------------------------
/examples/pretrain_gpt.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Runs the "345M" parameter model
 4 | 
 5 | RANK=0
 6 | WORLD_SIZE=1
 7 | 
 8 | DATA_PATH=<Specify path and file prefix>_text_document
 9 | CHECKPOINT_PATH=<Specify path>
10 | 
11 | 
12 | python pretrain_gpt.py \
13 |        --num-layers 24 \
14 |        --hidden-size 1024 \
15 |        --num-attention-heads 16 \
16 |        --micro-batch-size 4 \
17 |        --global-batch-size 8 \
18 |        --seq-length 1024 \
19 |        --max-position-embeddings 1024 \
20 |        --train-iters 500000 \
21 |        --lr-decay-iters 320000 \
22 |        --save $CHECKPOINT_PATH \
23 |        --load $CHECKPOINT_PATH \
24 |        --data-path $DATA_PATH \
25 |        --vocab-file gpt2-vocab.json \
26 |        --merge-file gpt2-merges.txt \
27 |        --data-impl mmap \
28 |        --split 949,50,1 \
29 |        --distributed-backend nccl \
30 |        --lr 0.00015 \
31 |        --min-lr 1.0e-5 \
32 |        --lr-decay-style cosine \
33 |        --weight-decay 1e-2 \
34 |        --clip-grad 1.0 \
35 |        --lr-warmup-fraction .01 \
36 |        --activations-checkpoint-method uniform \
37 |        --log-interval 100 \
38 |        --save-interval 10000 \
39 |        --eval-interval 1000 \
40 |        --eval-iters 10 \
41 |        --fp16
42 | 


--------------------------------------------------------------------------------
/examples/pretrain_gpt3_175B.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | #SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
 5 | 
 6 | 
 7 | DIR=`pwd`
 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
 9 | mkdir -p $DIR/logs
10 | 
11 | 
12 | DATASET_1="<PATH TO THE FIRST DATASET>"
13 | DATASET_2="<PATH TO THE SECOND DATASET>"
14 | DATASET_3="<PATH TO THE THIRD DATASET>"
15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
16 | 
17 | 
18 | options=" \
19 | 	--tensor-model-parallel-size 8 \
20 | 	--pipeline-model-parallel-size 16 \
21 |         --num-layers 96 \
22 |         --hidden-size 12288 \
23 |         --num-attention-heads 96 \
24 |         --seq-length 2048 \
25 |         --max-position-embeddings 2048 \
26 | 	--micro-batch-size 1 \
27 | 	--global-batch-size 1536 \
28 | 	--rampup-batch-size 16 16 5859375 \
29 | 	--train-samples 146484375 \
30 |        	--lr-decay-samples 126953125 \
31 |         --lr-warmup-samples 183105 \
32 |         --lr 6.0e-5 \
33 | 	--min-lr 6.0e-6 \
34 |         --lr-decay-style cosine \
35 |         --log-interval 10 \
36 |         --eval-iters 40 \
37 |         --eval-interval 1000 \
38 | 	--data-path ${DATASET} \
39 | 	--vocab-file <PATH TO gpt-vocab.json> \
40 | 	--merge-file <PATH TO gpt-merges.txt> \
41 | 	--save-interval 1000 \
42 | 	--save <PATH TO CHECKPOINTS DIRECTORY> \
43 | 	--load <PATH TO CHECKPOINTS DIRECTORY> \
44 |         --split 98,2,0 \
45 |         --clip-grad 1.0 \
46 | 	--weight-decay 0.1 \
47 | 	--adam-beta1 0.9 \
48 | 	--adam-beta2 0.95 \
49 | 	--init-method-std 0.006 \
50 | 	--tensorboard-dir <TENSORBOARD DIRECTORY> \
51 |         --fp16 \
52 | 	--activations-checkpoint-method uniform "
53 | 
54 | 
55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
56 | 
57 | 
58 | srun -l \
59 |      --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
60 |      --container-mounts "<DIRECTORIES TO MOUNT>" \
61 |      --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
62 | 
63 | 
64 | set +x
65 | 
66 | 


--------------------------------------------------------------------------------
/examples/pretrain_gpt_distributed.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Runs the "345M" parameter model
 4 | 
 5 | GPUS_PER_NODE=8
 6 | # Change for multinode config
 7 | MASTER_ADDR=localhost
 8 | MASTER_PORT=6000
 9 | NNODES=1
10 | NODE_RANK=0
11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
12 | 
13 | DATA_PATH=<Specify path and file prefix>_text_document
14 | CHECKPOINT_PATH=<Specify path>
15 | 
16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
17 | 
18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
19 |        pretrain_gpt.py \
20 |        --num-layers 24 \
21 |        --hidden-size 1024 \
22 |        --num-attention-heads 16 \
23 |        --micro-batch-size 8 \
24 |        --global-batch-size 64 \
25 |        --seq-length 1024 \
26 |        --max-position-embeddings 1024 \
27 |        --train-iters 500000 \
28 |        --lr-decay-iters 320000 \
29 |        --save $CHECKPOINT_PATH \
30 |        --load $CHECKPOINT_PATH \
31 |        --data-path $DATA_PATH \
32 |        --vocab-file gpt2-vocab.json \
33 |        --merge-file gpt2-merges.txt \
34 |        --data-impl mmap \
35 |        --split 949,50,1 \
36 |        --distributed-backend nccl \
37 |        --lr 0.00015 \
38 |        --lr-decay-style cosine \
39 |        --min-lr 1.0e-5 \
40 |        --weight-decay 1e-2 \
41 |        --clip-grad 1.0 \
42 |        --lr-warmup-fraction .01 \
43 |        --activations-checkpoint-method uniform \
44 |        --log-interval 100 \
45 |        --save-interval 10000 \
46 |        --eval-interval 1000 \
47 |        --eval-iters 10 \
48 |        --fp16
49 | 


--------------------------------------------------------------------------------
/examples/pretrain_gpt_distributed_with_mp.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Runs the "345M" parameter model
 4 | 
 5 | GPUS_PER_NODE=8
 6 | # Change for multinode config
 7 | MASTER_ADDR=localhost
 8 | MASTER_PORT=6000
 9 | NNODES=1
10 | NODE_RANK=0
11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
12 | 
13 | DATA_PATH=<Specify path and file prefix>_text_document
14 | CHECKPOINT_PATH=<Specify path>
15 | 
16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
17 | 
18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
19 |        pretrain_gpt.py \
20 |        --tensor-model-parallel-size 2 \
21 |        --pipeline-model-parallel-size 2 \
22 |        --sequence-parallel \
23 |        --num-layers 24 \
24 |        --hidden-size 1024 \
25 |        --num-attention-heads 16 \
26 |        --micro-batch-size 4 \
27 |        --global-batch-size 16 \
28 |        --seq-length 1024 \
29 |        --max-position-embeddings 1024 \
30 |        --train-iters 500000 \
31 |        --lr-decay-iters 320000 \
32 |        --save $CHECKPOINT_PATH \
33 |        --load $CHECKPOINT_PATH \
34 |        --data-path $DATA_PATH \
35 |        --vocab-file gpt2-vocab.json \
36 |        --merge-file gpt2-merges.txt \
37 |        --data-impl mmap \
38 |        --split 949,50,1 \
39 |        --distributed-backend nccl \
40 |        --lr 0.00015 \
41 |        --lr-decay-style cosine \
42 |        --min-lr 1.0e-5 \
43 |        --weight-decay 1e-2 \
44 |        --clip-grad 1.0 \
45 |        --lr-warmup-fraction .01 \
46 |        --activations-checkpoint-method uniform \
47 |        --log-interval 100 \
48 |        --save-interval 10000 \
49 |        --eval-interval 1000 \
50 |        --eval-iters 10 \
51 |        --fp16
52 | 


--------------------------------------------------------------------------------
/examples/pretrain_ict.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Runs the "217M" parameter biencoder model for ICT retriever
 4 | 
 5 | RANK=0
 6 | WORLD_SIZE=1
 7 | 
 8 | PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
 9 | TEXT_DATA_PATH=<Specify path and file prefix of the text data>
10 | TITLE_DATA_PATH=<Specify path and file prefix od the titles>
11 | CHECKPOINT_PATH=<Specify path>
12 | 
13 | 
14 | python pretrain_ict.py \
15 |         --num-layers 12 \
16 |         --hidden-size 768 \
17 |         --num-attention-heads 12 \
18 |         --tensor-model-parallel-size 1 \
19 |         --micro-batch-size 32 \
20 |         --seq-length 256 \
21 |         --max-position-embeddings 512 \
22 |         --train-iters 100000 \
23 |         --vocab-file bert-vocab.txt \
24 |         --tokenizer-type BertWordPieceLowerCase \
25 |         --DDP-impl torch \
26 |         --bert-load ${PRETRAINED_BERT_PATH} \
27 |         --log-interval 100 \
28 |         --eval-interval 1000 \
29 |         --eval-iters 10 \
30 |         --retriever-report-topk-accuracies 1 5 10 20 100 \
31 |         --retriever-score-scaling \
32 |         --load $CHECKPOINT_PATH \
33 |         --save $CHECKPOINT_PATH \
34 |         --data-path ${TEXT_DATA_PATH} \
35 |         --titles-data-path ${TITLE_DATA_PATH} \
36 |         --lr 0.0001 \
37 |         --lr-decay-style linear \
38 |         --weight-decay 1e-2 \
39 |         --clip-grad 1.0 \
40 |         --lr-warmup-fraction 0.01 \
41 |         --save-interval 4000 \
42 |         --exit-interval 8000 \
43 |         --query-in-block-prob 0.1 \
44 |         --fp16
45 | 


--------------------------------------------------------------------------------
/examples/pretrain_t5.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | RANK=0
 4 | WORLD_SIZE=1
 5 | DATA_PATH=<Specify path and file prefix>
 6 | VOCAB_FILE=<Specify path to vocab.txt>
 7 | CHECKPOINT_PATH=<Specify path>
 8 | 
 9 | python pretrain_t5.py \
10 |        --num-layers 12 \
11 |        --hidden-size 768 \
12 |        --num-attention-heads 12 \
13 |        --kv-channels 64 \
14 |        --ffn-hidden-size 3072 \
15 |        --encoder-seq-length 512 \
16 |        --decoder-seq-length 128 \
17 |        --micro-batch-size 16 \
18 |        --global-batch-size 16 \
19 |        --max-position-embeddings 512 \
20 |        --train-iters 1000000 \
21 |        --lr-decay-iters 1000000 \
22 |        --save $CHECKPOINT_PATH \
23 |        --load $CHECKPOINT_PATH \
24 |        --data-path $DATA_PATH \
25 |        --vocab-file $VOCAB_FILE \
26 |        --data-impl mmap \
27 |        --split 949,50,1 \
28 |        --lr 0.0001 \
29 |        --min-lr 0.00001 \
30 |        --lr-decay-style linear \
31 |        --lr-warmup-fraction .01 \
32 |        --weight-decay 1e-2 \
33 |        --clip-grad 1.0 \
34 |        --log-interval 100 \
35 |        --save-interval 10000 \
36 |        --eval-interval 1000 \
37 |        --eval-iters 10 \
38 |        --fp16 \
39 |        --vocab-extra-ids 100
40 | 


--------------------------------------------------------------------------------
/examples/pretrain_t5_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | GPUS_PER_NODE=8
 4 | # Change for multinode config
 5 | MASTER_ADDR=localhost
 6 | MASTER_PORT=6000
 7 | NNODES=1
 8 | NODE_RANK=0
 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
10 | 
11 | DATA_PATH=<Specify path and file prefix>
12 | VOCAB_FILE=<Specify path to vocab.txt>
13 | CHECKPOINT_PATH=<Specify path>
14 | 
15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
16 | 
17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
18 |        pretrain_t5.py \
19 |        --num-layers 12 \
20 |        --hidden-size 768 \
21 |        --num-attention-heads 12 \
22 |        --kv-channels 64 \
23 |        --ffn-hidden-size 3072 \
24 |        --encoder-seq-length 512 \
25 |        --decoder-seq-length 128 \
26 |        --micro-batch-size 16 \
27 |        --global-batch-size 128 \
28 |        --max-position-embeddings 512 \
29 |        --train-iters 1000000 \
30 |        --lr-decay-iters 1000000 \
31 |        --save $CHECKPOINT_PATH \
32 |        --load $CHECKPOINT_PATH \
33 |        --data-path $DATA_PATH \
34 |        --vocab-file $VOCAB_FILE \
35 |        --data-impl mmap \
36 |        --split 949,50,1 \
37 |        --lr 0.0001 \
38 |        --min-lr 0.00001 \
39 |        --lr-decay-style linear \
40 |        --lr-warmup-fraction .01 \
41 |        --weight-decay 1e-2 \
42 |        --clip-grad 1.0 \
43 |        --log-interval 100 \
44 |        --save-interval 10000 \
45 |        --eval-interval 1000 \
46 |        --eval-iters 10 \
47 |        --fp16 \
48 |        --vocab-extra-ids 100
49 | 


--------------------------------------------------------------------------------
/examples/pretrain_t5_distributed_with_mp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | GPUS_PER_NODE=8
 4 | # Change for multinode config
 5 | MASTER_ADDR=localhost
 6 | MASTER_PORT=6000
 7 | NNODES=1
 8 | NODE_RANK=0
 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
10 | 
11 | DATA_PATH=<Specify path and file prefix>
12 | CHECKPOINT_PATH=<Specify path>
13 | 
14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
15 | 
16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \
17 |        pretrain_t5.py \
18 |        --tensor-model-parallel-size 2 \
19 |        --num-layers 12 \
20 |        --hidden-size 768 \
21 |        --num-attention-heads 12 \
22 |        --kv-channels 64 \
23 |        --ffn-hidden-size 3072 \
24 |        --encoder-seq-length 512 \
25 |        --decoder-seq-length 128 \
26 |        --micro-batch-size 16 \
27 |        --global-batch-size 128 \
28 |        --max-position-embeddings 512 \
29 |        --train-iters 1000000 \
30 |        --lr-decay-iters 1000000 \
31 |        --save $CHECKPOINT_PATH \
32 |        --load $CHECKPOINT_PATH \
33 |        --data-path $DATA_PATH \
34 |        --vocab-file t5-vocab.txt \
35 |        --data-impl mmap \
36 |        --split 949,50,1 \
37 |        --lr 0.0001 \
38 |        --min-lr 0.00001 \
39 |        --lr-decay-style linear \
40 |        --lr-warmup-fraction .01 \
41 |        --weight-decay 1e-2 \
42 |        --clip-grad 1.0 \
43 |        --log-interval 100 \
44 |        --save-interval 10000 \
45 |        --eval-interval 1000 \
46 |        --eval-iters 10 \
47 |        --fp16  \
48 |        --vocab-extra-ids 100
49 | 


--------------------------------------------------------------------------------
/examples/run_text_generation_server_345M.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This example will start serving the 345M model.
 3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \
 4 |                   --nnodes 1 \
 5 |                   --node_rank 0 \
 6 |                   --master_addr localhost \
 7 |                   --master_port 6000"
 8 | 
 9 | CHECKPOINT=<Path to checkpoint (e.g /345m)>
10 | VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
11 | MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
12 | 
13 | pip install flask-restful
14 | 
15 | python -m torch.distributed.run $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
16 |        --tensor-model-parallel-size 1  \
17 |        --pipeline-model-parallel-size 1  \
18 |        --num-layers 24  \
19 |        --hidden-size 1024  \
20 |        --load ${CHECKPOINT}  \
21 |        --num-attention-heads 16  \
22 |        --max-position-embeddings 1024  \
23 |        --tokenizer-type GPT2BPETokenizer  \
24 |        --fp16  \
25 |        --micro-batch-size 1  \
26 |        --seq-length 1024  \
27 |        --out-seq-length 1024  \
28 |        --temperature 1.0  \
29 |        --vocab-file $VOCAB_FILE  \
30 |        --merge-file $MERGE_FILE  \
31 |        --top_p 0.9  \
32 |        --seed 42
33 | 


--------------------------------------------------------------------------------
/examples/run_text_generation_server_345M_8_tensor_parallel.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel
 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \
 4 |                   --nnodes 1 \
 5 |                   --node_rank 0 \
 6 |                   --master_addr localhost \
 7 |                   --master_port 6000"
 8 | 
 9 | CHECKPOINT=<Path to checkpoint (e.g /345m)>
10 | VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
11 | MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
12 | 
13 | pip install flask-restful
14 | 
15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
16 |        --tensor-model-parallel-size 8  \
17 |        --pipeline-model-parallel-size 1  \
18 |        --num-layers 24  \
19 |        --hidden-size 1024  \
20 |        --load ${CHECKPOINT}  \
21 |        --num-attention-heads 16  \
22 |        --max-position-embeddings 1024  \
23 |        --tokenizer-type GPT2BPETokenizer  \
24 |        --fp16  \
25 |        --micro-batch-size 1  \
26 |        --seq-length 1024  \
27 |        --out-seq-length 1024  \
28 |        --temperature 1.0  \
29 |        --vocab-file $VOCAB_FILE  \
30 |        --merge-file $MERGE_FILE  \
31 |        --top_p 0.9  \
32 |        --seed 42
33 | 


--------------------------------------------------------------------------------
/examples/sc21/CONFIG.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | # SLURM options.
 5 | export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
 6 | export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
 7 | 
 8 | 
 9 | # Source code.
10 | export MEGATRON_CODE_DIR=<megatron source code directory>
11 | 
12 | 
13 | # This variable is used to mount the relevant part of the filesystem
14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
15 | # launch directory already get mounted; this variable should be used to
16 | # mount the directories that contain the data and tokenizer files.
17 | export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
18 | 
19 | 
20 | # Data and tokenizer files.
21 | MEGATRON_DATA=<path to megatron processed data>
22 | BPE_VOCAB_FILE=<path to bpe vocab file>
23 | BPE_MERGE_FILE=<path to bpe merges file>
24 | 
25 | 
26 | # Megatron input parameters.
27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
28 | # that are not listed here. 
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
30 | 	--tensor-model-parallel-size ${TP} \
31 | 	--pipeline-model-parallel-size ${PP} \
32 | 	--micro-batch-size ${MBS} \
33 | 	--global-batch-size ${GBS} \
34 |         --num-layers ${NLS} \
35 |         --hidden-size ${HS} \
36 |         --num-attention-heads ${NAH} \
37 | 	--DDP-impl ${DDP} \
38 | 	--data-path ${MEGATRON_DATA} \
39 | 	--vocab-file ${BPE_VOCAB_FILE} \
40 | 	--merge-file ${BPE_MERGE_FILE} \
41 |         --log-interval 5 \
42 |         --seq-length 2048 \
43 |         --max-position-embeddings 2048 \
44 |         --train-iters 500 \
45 |         --lr-decay-iters 320 \
46 |         --lr 0.0001 \
47 | 	--min-lr 0.00001 \
48 |         --lr-decay-style cosine \
49 |         --lr-warmup-fraction 0.01 \
50 |         --split 969,30,1 \
51 |         --eval-iters 100 \
52 |         --eval-interval 1000 \
53 |         --clip-grad 1.0 \
54 |         --fp16 \
55 | 	--loss-scale 8192 "
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/examples/sc21/README.md:
--------------------------------------------------------------------------------
 1 | # Reproducing Figures in SC21 Paper
 2 | 
 3 | 
 4 | This directory contains some of the scripts that were used to produce the
 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
 9 | schedulers as well.
10 | 
11 | 
12 | ## Setup
13 | 
14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
15 | update the unspecified values (in angle brackets `<...>`) before launching any
16 | scripts.
17 | 
18 | 
19 | 
20 | ## Scripts
21 | 
22 | Below is a list of scripts that can be used to reproduce various figures in our
23 | [paper](https://arxiv.org/pdf/2104.04473.pdf):
24 | 
25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
26 | for GPT models ranging from 1 billion to 1 trillion parameters.
27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
28 | performance of pipeline parallelism.
29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
30 | the interleaved schedule on a 175B GPT model.
31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
32 | different degrees of pipeline and tensor model parallelism on a model with
33 | 162.2 billion parameters.
34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
35 | different degrees of data and pipeline model parallelism on a model with
36 | 5.9 billion parameters.
37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
38 | different degrees of data and tensor model parallelism on a model with
39 | 5.9 billion parameters.
40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
41 | microbatch size.
42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
43 | activation recomputation.
44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
45 | the scatter-gather communication optimization.
46 | 


--------------------------------------------------------------------------------
/examples/sc21/SBATCH.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | sbatch -p ${SLURM_PARTITION} \
 5 |        -A ${SLURM_ACCOUNT} \
 6 |        --job-name=${JOB_NAME} \
 7 |        --nodes=${NNODES} \
 8 |        --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
 9 | 
10 | exit 0
11 | 
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/examples/sc21/SRUN.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
 4 | 
 5 | 
 6 | THIS_DIR=`pwd`
 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
 8 | mkdir -p ${THIS_DIR}/logs
 9 | 
10 | 
11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
12 | 
13 | 
14 | srun -l \
15 |      --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
16 |      --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
17 |      --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
18 | 
19 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_11.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [1, 2, 4, 8].
 8 | PP=1
 9 | 
10 | # Batch size (global batch size) options = [8, 128].
11 | GBS=8
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel size options.
18 | NLS=$((3*PP))
19 | NNODES=${PP}
20 | 
21 | 
22 | # Other params.
23 | TP=8
24 | MBS=1
25 | HS=20480
26 | NAH=128
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | 
30 | 
31 | # Name of the job.
32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
33 | 
34 | 
35 | # Import the configs.
36 | . `pwd`/CONFIG.sh
37 | 
38 | 
39 | # Submit the job.
40 | . `pwd`/SBATCH.sh
41 | 
42 | 
43 | exit 0
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_12.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Interleaved schedule options = [YES, NO].
 8 | INTERLEAVED=YES
 9 | 
10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60].
11 | GBS=12
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set interleaved schedule options.
18 | if [ ${INTERLEAVED} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${INTERLEAVED} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_13.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32].
 8 | PP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 128].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel and tensor-parallel size options.
18 | TP=$((64/PP))
19 | 
20 | 
21 | # Other params.
22 | MBS=1
23 | NLS=32
24 | HS=20480
25 | NAH=128
26 | DDP=local
27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
28 | NNODES=8
29 | 
30 | 
31 | # Name of the job.
32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
33 | 
34 | 
35 | # Import the configs.
36 | . `pwd`/CONFIG.sh
37 | 
38 | 
39 | # Submit the job.
40 | . `pwd`/SBATCH.sh
41 | 
42 | 
43 | exit 0
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_14.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32].
 8 | PP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 512].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set pipeline-parallel and data-parallel size options.
18 | DP=$((64/PP))
19 | 
20 | 
21 | # Other params.
22 | TP=1
23 | MBS=1
24 | NLS=32
25 | HS=3840
26 | NAH=32
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | NNODES=8
30 | 
31 | 
32 | # Name of the job.
33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
34 | 
35 | 
36 | # Import the configs.
37 | . `pwd`/CONFIG.sh
38 | 
39 | 
40 | # Submit the job.
41 | . `pwd`/SBATCH.sh
42 | 
43 | 
44 | exit 0
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_15.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32].
 8 | TP=2
 9 | 
10 | # Batch size (global batch size) options = [32, 128, 512].
11 | GBS=32
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set tensor-parallel and data-parallel size options.
18 | DP=$((64/TP))
19 | 
20 | 
21 | # Other params.
22 | PP=1
23 | MBS=1
24 | NLS=32
25 | HS=3840
26 | NAH=32
27 | DDP=local
28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
29 | NNODES=8
30 | 
31 | 
32 | # Name of the job.
33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
34 | 
35 | 
36 | # Import the configs.
37 | . `pwd`/CONFIG.sh
38 | 
39 | 
40 | # Submit the job.
41 | . `pwd`/SBATCH.sh
42 | 
43 | 
44 | exit 0
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_16.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Microbatch size options = [1, 2, 4, 8].
 8 | MBS=1
 9 | 
10 | # Batch size (global batch size) options = [128, 512].
11 | GBS=128
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Other params.
18 | TP=8
19 | PP=8
20 | NLS=32
21 | HS=15360
22 | NAH=128
23 | DDP=local
24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
25 | NNODES=8
26 | 
27 | 
28 | # Name of the job.
29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
30 | 
31 | 
32 | # Import the configs.
33 | . `pwd`/CONFIG.sh
34 | 
35 | 
36 | # Submit the job.
37 | . `pwd`/SBATCH.sh
38 | 
39 | 
40 | exit 0
41 | 
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_17.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Activation recomputation options = [YES, NO].
 8 | ACTIVATION_RECOMPUTATION=YES
 9 | 
10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256].
11 | GBS=1
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set activation recomputation.
18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS=""
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=16
31 | MBS=1
32 | NLS=80
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=16
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/sc21/run_figure_18.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ================================
 4 | # Choose the case to run.
 5 | # ================================
 6 | 
 7 | # Scatter-gather communication optimization options = [YES, NO].
 8 | SCATTER_GATHER=YES
 9 | 
10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60].
11 | GBS=12
12 | 
13 | 
14 | 
15 | 
16 | 
17 | # Set scatter-gather communication optimization options.
18 | if [ ${SCATTER_GATHER} == "YES" ]; then
19 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
20 | elif [ ${SCATTER_GATHER} == "NO" ]; then
21 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
22 | else
23 |     echo "Invalid configuration"
24 |     exit 1
25 | fi
26 | 
27 | 
28 | # Other params.
29 | TP=8
30 | PP=12
31 | MBS=1
32 | NLS=96
33 | HS=12288
34 | NAH=96
35 | DDP=local
36 | NNODES=12
37 | 
38 | 
39 | # Name of the job.
40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
41 | 
42 | 
43 | # Import the configs.
44 | . `pwd`/CONFIG.sh
45 | 
46 | 
47 | # Submit the job.
48 | . `pwd`/SBATCH.sh
49 | 
50 | 
51 | exit 0
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/examples/sc21/run_table_1.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # ================================
  4 | # Choose the case to run.
  5 | # ================================
  6 | # model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
  7 | MODEL_SIZE=1.7B
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | if [ ${MODEL_SIZE} == "1.7B" ]; then
 15 |     TP=1
 16 |     PP=1
 17 |     MBS=16
 18 |     GBS=512
 19 |     NLS=24
 20 |     HS=2304
 21 |     NAH=24
 22 |     DDP=torch
 23 |     NNODES=4
 24 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
 25 | elif [ ${MODEL_SIZE} == "3.6B" ]; then
 26 |     TP=2
 27 |     PP=1
 28 |     MBS=16
 29 |     GBS=512
 30 |     NLS=30
 31 |     HS=3072
 32 |     NAH=32
 33 |     DDP=torch
 34 |     NNODES=8
 35 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
 36 | elif [ ${MODEL_SIZE} == "7.5B" ]; then
 37 |     TP=4
 38 |     PP=1
 39 |     MBS=16
 40 |     GBS=512
 41 |     NLS=36
 42 |     HS=4096
 43 |     NAH=32
 44 |     DDP=torch
 45 |     NNODES=16
 46 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
 47 | elif [ ${MODEL_SIZE} == "18B" ]; then
 48 |     TP=8
 49 |     PP=1
 50 |     MBS=8
 51 |     GBS=1024
 52 |     NLS=40
 53 |     HS=6144
 54 |     NAH=48
 55 |     DDP=torch
 56 |     NNODES=32
 57 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
 58 | elif [ ${MODEL_SIZE} == "39B" ]; then
 59 |     TP=8
 60 |     PP=2
 61 |     MBS=4
 62 |     GBS=1536
 63 |     NLS=48
 64 |     HS=8192
 65 |     NAH=64
 66 |     DDP=local
 67 |     NNODES=64
 68 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
 69 | elif [ ${MODEL_SIZE} == "76B" ]; then
 70 |     TP=8
 71 |     PP=4
 72 |     MBS=2
 73 |     GBS=1792
 74 |     NLS=60
 75 |     HS=10240
 76 |     NAH=80
 77 |     DDP=local
 78 |     NNODES=128
 79 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
 80 | elif [ ${MODEL_SIZE} == "145B" ]; then
 81 |     TP=8
 82 |     PP=8
 83 |     MBS=2
 84 |     GBS=2304
 85 |     NLS=80
 86 |     HS=12288
 87 |     NAH=96
 88 |     DDP=local
 89 |     NNODES=192
 90 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
 91 | elif [ ${MODEL_SIZE} == "310B" ]; then
 92 |     TP=8
 93 |     PP=16
 94 |     MBS=1
 95 |     GBS=2160
 96 |     NLS=96
 97 |     HS=16384
 98 |     NAH=128
 99 |     DDP=local
100 |     NNODES=240
101 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
102 | elif [ ${MODEL_SIZE} == "530B" ]; then
103 |     TP=8
104 |     PP=35
105 |     MBS=1
106 |     GBS=2520
107 |     NLS=105
108 |     HS=20480
109 |     NAH=128
110 |     DDP=local
111 |     NNODES=315
112 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
113 | elif [ ${MODEL_SIZE} == "1T" ]; then
114 |     TP=8
115 |     PP=64
116 |     MBS=1
117 |     GBS=3072
118 |     NLS=128
119 |     HS=25600
120 |     NAH=160
121 |     DDP=local
122 |     NNODES=384
123 |     MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
124 | else
125 |     echo "Invalid configuration"
126 |     exit 1
127 | fi
128 | 
129 | 
130 | # Name of the job
131 | export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
132 | 
133 | 
134 | # Import the configs.
135 | . `pwd`/CONFIG.sh
136 | 
137 | 
138 | # Submit the job.
139 | . `pwd`/SBATCH.sh
140 | 
141 | 
142 | exit 0
143 | 
144 | 
145 | 
146 | 


--------------------------------------------------------------------------------
/images/Achieved_petaFLOPs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/images/Achieved_petaFLOPs.png


--------------------------------------------------------------------------------
/images/cases_april2021.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/images/cases_april2021.png


--------------------------------------------------------------------------------
/megatron/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | import torch
 3 | 
 4 | from .global_vars import get_args
 5 | from .global_vars import get_current_global_batch_size
 6 | from .global_vars import get_num_microbatches
 7 | from .global_vars import get_signal_handler
 8 | from .global_vars import update_num_microbatches
 9 | from .global_vars import get_tokenizer
10 | from .global_vars import get_tensorboard_writer
11 | from .global_vars import get_adlr_autoresume
12 | from .global_vars import get_timers
13 | from .initialize  import initialize_megatron
14 | 
15 | from .utils import (print_rank_0,
16 |                     is_last_rank,
17 |                     print_rank_last)
18 | 


--------------------------------------------------------------------------------
/megatron/core/__init__.py:
--------------------------------------------------------------------------------
 1 | import megatron.core.parallel_state
 2 | import megatron.core.tensor_parallel
 3 | import megatron.core.utils
 4 | 
 5 | # Alias parallel_state as mpu, its legacy name
 6 | mpu = parallel_state
 7 | 
 8 | __all__ = [
 9 |     "parallel_state",
10 |     "tensor_parallel",
11 |     "utils",
12 | ]
13 | 


--------------------------------------------------------------------------------
/megatron/core/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
 1 | from .cross_entropy import vocab_parallel_cross_entropy
 2 | from .data import broadcast_data
 3 | 
 4 | from .layers import (
 5 |     ColumnParallelLinear,
 6 |     RowParallelLinear,
 7 |     VocabParallelEmbedding,
 8 |     set_tensor_model_parallel_attributes,
 9 |     set_defaults_if_not_set_tensor_model_parallel_attributes,
10 |     copy_tensor_model_parallel_attributes,
11 |     param_is_not_tensor_parallel_duplicate,
12 |     linear_with_grad_accumulation_and_async_allreduce
13 | 
14 | )
15 | 
16 | from .mappings import (
17 |     copy_to_tensor_model_parallel_region,
18 |     gather_from_tensor_model_parallel_region,
19 |     gather_from_sequence_parallel_region,
20 |     scatter_to_tensor_model_parallel_region,
21 |     scatter_to_sequence_parallel_region,
22 | )
23 | 
24 | from .random import (
25 |     checkpoint,
26 |     get_cuda_rng_tracker,
27 |     model_parallel_cuda_manual_seed,
28 | )
29 | 
30 | from .utils import (
31 |     split_tensor_along_last_dim,
32 |     split_tensor_into_1d_equal_chunks,
33 |     gather_split_1d_tensor,
34 | )
35 | 
36 | __all__ = [
37 |     # cross_entropy.py
38 |     "vocab_parallel_cross_entropy",
39 |     # data.py
40 |     "broadcast_data",
41 |     #layers.py
42 |     "ColumnParallelLinear",
43 |     "RowParallelLinear",
44 |     "VocabParallelEmbedding",
45 |     "set_tensor_model_parallel_attributes",
46 |     "set_defaults_if_not_set_tensor_model_parallel_attributes",
47 |     "copy_tensor_model_parallel_attributes",
48 |     "param_is_not_tensor_parallel_duplicate",
49 |     "linear_with_grad_accumulation_and_async_allreduce",
50 |     # mappings.py
51 |     "copy_to_tensor_model_parallel_region",
52 |     "gather_from_tensor_model_parallel_region",
53 |     "gather_from_sequence_parallel_region",
54 | #    "reduce_from_tensor_model_parallel_region",
55 |     "scatter_to_tensor_model_parallel_region",
56 |     "scatter_to_sequence_parallel_region",
57 |     # random.py
58 |     "checkpoint",
59 |     "get_cuda_rng_tracker",
60 |     "model_parallel_cuda_manual_seed",
61 |     # utils.py
62 |     "split_tensor_along_last_dim",
63 |     "split_tensor_into_1d_equal_chunks",
64 |     "gather_split_1d_tensor",
65 | ]
66 | 


--------------------------------------------------------------------------------
/megatron/core/tensor_parallel/data.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | import torch
  4 | 
  5 | from megatron.core.parallel_state import (
  6 |     get_tensor_model_parallel_group,
  7 |     get_tensor_model_parallel_rank,
  8 |     get_tensor_model_parallel_src_rank,
  9 | )
 10 | 
 11 | 
 12 | _MAX_DATA_DIM = 5
 13 | 
 14 | 
 15 | def _check_data_types(keys, data, target_dtype):
 16 |     """Check that all the keys have the same target data type."""
 17 |     for key in keys:
 18 |         assert data[key].dtype == target_dtype, '{} has data type {} which '\
 19 |             'is different than {}'.format(key, data[key].dtype, target_dtype)
 20 | 
 21 | 
 22 | def _build_key_size_numel_dictionaries(keys, data):
 23 |     """Build the size on rank 0 and broadcast."""
 24 |     max_dim = _MAX_DATA_DIM
 25 |     sizes = [0 for _ in range(max_dim) for _ in keys]
 26 | 
 27 |     # Pack the sizes on rank zero.
 28 |     if get_tensor_model_parallel_rank() == 0:
 29 |         offset = 0
 30 |         for key in keys:
 31 |             assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
 32 |             size = data[key].size()
 33 |             for i, s in enumerate(size):
 34 |                 sizes[i + offset] = s
 35 |             offset += max_dim
 36 | 
 37 |     # Move to GPU and broadcast.
 38 |     sizes_cuda = torch.cuda.LongTensor(sizes)
 39 |     torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
 40 |                                 group=get_tensor_model_parallel_group())
 41 | 
 42 |     # Move back to cpu and unpack.
 43 |     sizes_cpu = sizes_cuda.cpu()
 44 |     key_size = {}
 45 |     key_numel = {}
 46 |     total_numel = 0
 47 |     offset = 0
 48 |     for key in keys:
 49 |         i = 0
 50 |         size = []
 51 |         numel = 1
 52 |         while sizes_cpu[offset + i] > 0:
 53 |             this_size = sizes_cpu[offset + i]
 54 |             size.append(this_size)
 55 |             numel *= this_size
 56 |             i += 1
 57 |         key_size[key] = size
 58 |         key_numel[key] = numel
 59 |         total_numel += numel
 60 |         offset += max_dim
 61 | 
 62 |     return key_size, key_numel, total_numel
 63 | 
 64 | 
 65 | def broadcast_data(keys, data, datatype):
 66 |     """Broadcast data from rank zero of each model parallel group to the
 67 |     members of the same model parallel group.
 68 | 
 69 |     Arguments:
 70 |         keys: list of keys in the data disctionary to be broadcasted
 71 |         data: data dictionary of string keys and cpu tensor values.
 72 |         datatype: torch data type of all tensors in data associated
 73 |                   with keys.
 74 |     """
 75 |     # Build (key, size) and (key, number of elements) dictionaries along
 76 |     # with the total number of elements on all ranks.
 77 |     key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
 78 |                                                                           data)
 79 | 
 80 |     # Pack on rank zero.
 81 |     if get_tensor_model_parallel_rank() == 0:
 82 |         # Check that all keys have the same data type.
 83 |         _check_data_types(keys, data, datatype)
 84 |         # Flatten the data associated with the keys
 85 |         flatten_data = torch.cat(
 86 |             [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
 87 |     else:
 88 |         flatten_data = torch.empty(total_numel,
 89 |                                    device=torch.cuda.current_device(),
 90 |                                    dtype=datatype)
 91 | 
 92 |     # Broadcast
 93 |     torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(),
 94 |                                 group=get_tensor_model_parallel_group())
 95 | 
 96 |     # Unpack
 97 |     output = {}
 98 |     offset = 0
 99 |     for key in keys:
100 |         size = key_size[key]
101 |         numel = key_numel[key]
102 |         output[key] = flatten_data.narrow(0, offset, numel).view(size)
103 |         offset += numel
104 | 
105 |     return output
106 | 


--------------------------------------------------------------------------------
/megatron/data/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
 3 | LIBNAME = helpers
 4 | LIBEXT = $(shell python3-config --extension-suffix)
 5 | 
 6 | default: $(LIBNAME)$(LIBEXT)
 7 | 
 8 | %$(LIBEXT): %.cpp
 9 | 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 | 


--------------------------------------------------------------------------------
/megatron/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import indexed_dataset
2 | 


--------------------------------------------------------------------------------
/megatron/data/blendable_dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Blendable dataset."""
 4 | 
 5 | import time
 6 | 
 7 | import numpy as np
 8 | import torch
 9 | 
10 | from megatron import print_rank_0
11 | 
12 | class BlendableDataset(torch.utils.data.Dataset):
13 | 
14 | 
15 |     def __init__(self, datasets, weights):
16 | 
17 |         self.datasets = datasets
18 |         num_datasets = len(datasets)
19 |         assert num_datasets == len(weights)
20 | 
21 |         self.size = 0
22 |         for dataset in self.datasets:
23 |             self.size += len(dataset)
24 | 
25 |         # Normalize weights.
26 |         weights = np.array(weights, dtype=np.float64)
27 |         sum_weights = np.sum(weights)
28 |         assert sum_weights > 0.0
29 |         weights /= sum_weights
30 | 
31 |         # Build indecies.
32 |         start_time = time.time()
33 |         assert num_datasets < 255
34 |         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
35 |         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
36 | 
37 |         from megatron.data import helpers
38 |         helpers.build_blending_indices(self.dataset_index,
39 |                                        self.dataset_sample_index,
40 |                                        weights, num_datasets, self.size,
41 |                                        torch.distributed.get_rank() == 0)
42 |         print_rank_0('> elapsed time for building blendable dataset indices: '
43 |                      '{:.2f} (sec)'.format(time.time() - start_time))
44 | 
45 | 
46 |     def __len__(self):
47 |         return self.size
48 | 
49 | 
50 |     def __getitem__(self, idx):
51 |         dataset_idx = self.dataset_index[idx]
52 |         sample_idx = self.dataset_sample_index[idx]
53 |         return self.datasets[dataset_idx][sample_idx]
54 | 


--------------------------------------------------------------------------------
/megatron/data/test/test_preprocess_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | IMPL=cached
 4 | python ../preprocess_data.py \
 5 |        --input test_samples.json \
 6 |        --vocab vocab.txt \
 7 |        --dataset-impl ${IMPL} \
 8 |        --output-prefix test_samples_${IMPL} \
 9 |        --workers 1 \
10 |        --log-interval 2
11 | 


--------------------------------------------------------------------------------
/megatron/dist_signal_handler.py:
--------------------------------------------------------------------------------
 1 | import signal
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | def get_world_size():
 7 |     if torch.distributed.is_available() and torch.distributed.is_initialized():
 8 |         world_size = torch.distributed.get_world_size()
 9 |     else:
10 |         world_size = 1
11 |     return world_size
12 | 
13 | 
14 | def get_device(local_rank=None):
15 |     backend = torch.distributed.get_backend()
16 |     if backend == 'nccl':
17 |         if local_rank is None:
18 |             device = torch.device('cuda')
19 |         else:
20 |             device = torch.device(f'cuda:{local_rank}')
21 |     elif backend == 'gloo':
22 |         device = torch.device('cpu')
23 |     else:
24 |         raise RuntimeError
25 |     return device
26 | 
27 | 
28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
29 |     if not torch.distributed.is_available() or \
30 |        not torch.distributed.is_initialized():
31 |         return [item]
32 | 
33 |     device = get_device(local_rank)
34 | 
35 |     if group is not None:
36 |         group_size = group.size()
37 |     else:
38 |         group_size = get_world_size()
39 | 
40 |     tensor = torch.tensor([item], device=device, dtype=dtype)
41 |     output_tensors = [
42 |         torch.zeros(1, dtype=tensor.dtype, device=tensor.device)
43 |         for _ in range(group_size)
44 |     ]
45 |     torch.distributed.all_gather(output_tensors, tensor, group, async_op)
46 |     output = [elem.item() for elem in output_tensors]
47 |     return output
48 | 
49 | 
50 | class DistributedSignalHandler:
51 |     def __init__(self, sig=signal.SIGTERM):
52 |         self.sig = sig
53 | 
54 |     def signals_received(self):
55 |         all_received = all_gather_item(
56 |             self._signal_received, dtype=torch.int32
57 |         )
58 |         return all_received
59 | 
60 |     def __enter__(self):
61 |         self._signal_received = False
62 |         self.released = False
63 |         self.original_handler = signal.getsignal(self.sig)
64 | 
65 |         def handler(signum, frame):
66 |             self._signal_received = True
67 | 
68 |         signal.signal(self.sig, handler)
69 | 
70 |         return self
71 | 
72 |     def __exit__(self, type, value, tb):
73 |         self.release()
74 | 
75 |     def release(self):
76 |         if self.released:
77 |             return False
78 | 
79 |         signal.signal(self.sig, self.original_handler)
80 |         self.released = True
81 |         return True
82 | 


--------------------------------------------------------------------------------
/megatron/fp16_deprecated/loss_scaler.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """For backward compatibility, we need the class definitions to deserialize."""
 4 | 
 5 | class LossScaler:
 6 |     def __init__(self, scale=1):
 7 |         self.cur_scale = scale
 8 | 
 9 | class DynamicLossScaler:
10 |     def __init__(self,
11 |                  init_scale=2**32,
12 |                  scale_factor=2.,
13 |                  scale_window=1000,
14 |                  min_scale=1,
15 |                  delayed_shift=1,
16 |                  consecutive_hysteresis=False):
17 |         self.cur_scale = init_scale
18 |         self.cur_iter = 0
19 |         self.last_overflow_iter = -1
20 |         self.scale_factor = scale_factor
21 |         self.scale_window = scale_window
22 |         self.min_scale = min_scale
23 |         self.delayed_shift = delayed_shift
24 |         self.cur_hysteresis = delayed_shift
25 |         self.consecutive_hysteresis = consecutive_hysteresis
26 | 
27 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved. */
 2 | 
 3 | /*This code is copied fron NVIDIA apex:
 4 |  *     https://github.com/NVIDIA/apex
 5 |  *     with minor changes. */
 6 | 
 7 | 
 8 | 
 9 | #ifndef TORCH_CHECK
10 | #define TORCH_CHECK AT_CHECK
11 | #endif
12 | 
13 | #ifdef VERSION_GE_1_3
14 | #define DATA_PTR data_ptr
15 | #else
16 | #define DATA_PTR data
17 | #endif
18 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/fused_weight_gradient_dense.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/torch.h>
 2 | #include <torch/extension.h>
 3 | 
 4 | #include <vector>
 5 | #include <stdio.h>
 6 | 
 7 | #include "type_shim.h"
 8 | 
 9 | 
10 | template <typename T>
11 | int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
12 | 
13 | void wgrad_gemm_accum_fp32(const at::Tensor input, const at::Tensor d_output, at::Tensor d_weight) {
14 |     at::Tensor input_2d, d_output_2d;
15 |     // input tensor: collapse to the first dim
16 |     auto in_sizes = input.sizes();
17 |     if (input.dim() > 2) {
18 |         input_2d = input.view({-1, in_sizes[in_sizes.size() - 1]});
19 |     } else {
20 |         input_2d = input;
21 |     }
22 |     // d_output tensor: collapse to the first dim
23 |     auto d_out_sizes = d_output.sizes();
24 |     if (d_output.dim() > 2) {
25 |         d_output_2d = d_output.view({-1, d_out_sizes[d_out_sizes.size() - 1]});
26 |     } else {
27 |         d_output_2d = d_output;
28 |     }
29 | 
30 |     int hidden_dim = input_2d.size(0);
31 |     int in_dim = input_2d.size(1);
32 |     int out_dim = d_weight.size(0);
33 | 
34 |     DISPATCH_HALF_BFLOAT_AND_FLOAT(input_2d.scalar_type(), "wgrad_gemm_accum_fp32",
35 |         int result = wgrad_gemm_accum_fp32_cuda<scalar_t>(
36 |             input_2d.data_ptr<scalar_t>(),
37 |             d_output_2d.data_ptr<scalar_t>(),
38 |             d_weight.data_ptr<float>(),
39 |             in_dim,
40 |             hidden_dim,
41 |             out_dim);
42 |     );
43 | }
44 | 
45 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
46 |     m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32, "wgrad gemm accum in fp32");
47 | }
48 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/fused_weight_gradient_dense.cu:
--------------------------------------------------------------------------------
  1 | #include <ATen/ATen.h>
  2 | #include <ATen/cuda/CUDAContext.h>
  3 | #include <assert.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include <torch/torch.h>
  8 | 
  9 | /* Includes, cuda */
 10 | #include <cublas_v2.h>
 11 | #include <cuda_runtime.h>
 12 | 
 13 | 
 14 | // BF16 Tensor core wrapper around cublas GEMMEx
 15 | cublasStatus_t gemmex_wrapper(
 16 |     cublasHandle_t handle,
 17 |     cublasOperation_t transa,
 18 |     cublasOperation_t transb,
 19 |     int m,
 20 |     int n,
 21 |     int k,
 22 |     const float* alpha,
 23 |     at::BFloat16* A,
 24 |     int lda,
 25 |     at::BFloat16* B,
 26 |     int ldb,
 27 |     const float* beta,
 28 |     float* C,
 29 |     int ldc) {
 30 |   return cublasGemmEx(
 31 |       handle,
 32 |       transa,
 33 |       transb,
 34 |       m,
 35 |       n,
 36 |       k,
 37 |       alpha,
 38 |       A,
 39 |       CUDA_R_16BF,
 40 |       lda,
 41 |       B,
 42 |       CUDA_R_16BF,
 43 |       ldb,
 44 |       beta,
 45 |       C,
 46 |       CUDA_R_32F,
 47 |       ldc,
 48 |       CUDA_R_32F,
 49 |       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
 50 | }
 51 | 
 52 | // FP16 Tensor core wrapper around cublas GEMMEx
 53 | cublasStatus_t gemmex_wrapper(
 54 |     cublasHandle_t handle,
 55 |     cublasOperation_t transa,
 56 |     cublasOperation_t transb,
 57 |     int m,
 58 |     int n,
 59 |     int k,
 60 |     const float* alpha,
 61 |     at::Half* A,
 62 |     int lda,
 63 |     at::Half* B,
 64 |     int ldb,
 65 |     const float* beta,
 66 |     float* C,
 67 |     int ldc) {
 68 |   return cublasGemmEx(
 69 |       handle,
 70 |       transa,
 71 |       transb,
 72 |       m,
 73 |       n,
 74 |       k,
 75 |       alpha,
 76 |       A,
 77 |       CUDA_R_16F,
 78 |       lda,
 79 |       B,
 80 |       CUDA_R_16F,
 81 |       ldb,
 82 |       beta,
 83 |       C,
 84 |       CUDA_R_32F,
 85 |       ldc,
 86 |       CUDA_R_32F,
 87 |       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
 88 | }
 89 | 
 90 | // FP32 Tensor core wrapper around cublas GEMMEx
 91 | cublasStatus_t gemmex_wrapper(
 92 |     cublasHandle_t handle,
 93 |     cublasOperation_t transa,
 94 |     cublasOperation_t transb,
 95 |     int m,
 96 |     int n,
 97 |     int k,
 98 |     const float* alpha,
 99 |     float* A,
100 |     int lda,
101 |     float* B,
102 |     int ldb,
103 |     const float* beta,
104 |     float* C,
105 |     int ldc) {
106 |   return cublasGemmEx(
107 |       handle,
108 |       transa,
109 |       transb,
110 |       m,
111 |       n,
112 |       k,
113 |       alpha,
114 |       A,
115 |       CUDA_R_32F,
116 |       lda,
117 |       B,
118 |       CUDA_R_32F,
119 |       ldb,
120 |       beta,
121 |       C,
122 |       CUDA_R_32F,
123 |       ldc,
124 |       CUDA_R_32F,
125 |       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
126 | }
127 | 
128 | template <typename T>
129 | int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim) {
130 |     cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
131 |     cudaStream_t stream;
132 |     cublasGetStream(handle, &stream);
133 |     const float alpha = 1.0;
134 |     const float beta  = 1.0;
135 |     int status = 1;
136 | 
137 |     status = gemmex_wrapper(
138 |         handle,
139 |         CUBLAS_OP_N,
140 |         CUBLAS_OP_T,
141 |         in_dim,
142 |         out_dim,
143 |         hidden_dim,
144 |         &alpha,
145 |         input,
146 |         in_dim,
147 |         d_output,
148 |         out_dim,
149 |         &beta,
150 |         d_weight,
151 |         in_dim);
152 |     return status;
153 | }
154 | 
155 | template int wgrad_gemm_accum_fp32_cuda<at::Half>(at::Half *input, at::Half *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
156 | template int wgrad_gemm_accum_fp32_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
157 | template int wgrad_gemm_accum_fp32_cuda<float>(float *input, float *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
158 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_masked_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     torch::Tensor const& mask,
14 |     float scale_factor);
15 | 
16 | torch::Tensor bwd_cuda(
17 |     torch::Tensor const& output_grads, 
18 |     torch::Tensor const& softmax_results,
19 |     float scale_factor);
20 | 
21 | int get_batch_per_block_cuda(
22 |     int query_seq_len,
23 |     int key_seq_len,
24 |     int batches,
25 |     int attn_heads);
26 | 
27 | torch::Tensor fwd(
28 |     torch::Tensor const& input,
29 |     torch::Tensor const& mask,
30 |     float scale_factor) {
31 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
32 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
33 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
34 |       "Only fp16 and bf16 are supported");
35 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
36 | 
37 |   return fwd_cuda(input, mask, scale_factor);
38 | }
39 | 
40 | torch::Tensor bwd(
41 |     torch::Tensor const& output_grads, 
42 |     torch::Tensor const& softmax_results,
43 |     float scale_factor) {
44 | 
45 |   AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
46 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
47 | 
48 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
49 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
50 |       "Only fp16 and bf16 are supported");
51 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
52 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
53 |       "Only fp16 and bf16 are supported");
54 | 
55 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
56 | }
57 | 
58 | int get_batch_per_block(
59 |     int query_seq_len,
60 |     int key_seq_len,
61 |     int batches,
62 |     int attn_heads) {
63 |     return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
64 | }
65 | 
66 | } // end namespace scaled_masked_softmax
67 | } // end namespace fused_softmax
68 | } // end namespace multihead_attn
69 | 
70 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
71 |   m.def("forward", 
72 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 
73 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
74 | 
75 |   m.def("backward",
76 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
77 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
78 | 
79 |   m.def("get_batch_per_block",
80 |         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
81 |         "Return Batch per block size."
82 |   );
83 | }
84 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
  2 | 
  3 | #include <ATen/ATen.h>
  4 | #include <cuda.h>
  5 | #include <cuda_runtime.h>
  6 | #include <cuda_fp16.h>
  7 | #include <cuda_profiler_api.h>
  8 | #include <ATen/cuda/CUDAContext.h>
  9 | #include <torch/extension.h>
 10 | #include "scaled_masked_softmax.h"
 11 | #include "type_shim.h"
 12 | 
 13 | namespace multihead_attn {
 14 | namespace fused_softmax {
 15 | namespace scaled_masked_softmax {
 16 | 
 17 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){
 18 |     return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
 19 | }
 20 | 
 21 | 
 22 | torch::Tensor fwd_cuda(
 23 |     torch::Tensor const& input,
 24 |     torch::Tensor const& mask,
 25 |     float scale_factor)
 26 | {
 27 |   // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 28 |   const int batches = input.size(0);
 29 |   const int pad_batches = mask.size(0);
 30 |   const int attn_heads = input.size(1);
 31 |   const int query_seq_len = input.size(2);
 32 |   const int key_seq_len = input.size(3);
 33 |   TORCH_INTERNAL_ASSERT(key_seq_len <= 4096);
 34 |   TORCH_INTERNAL_ASSERT(query_seq_len > 1);
 35 |   TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
 36 |   TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
 37 |   TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
 38 |   TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);
 39 | 
 40 |   // Output 
 41 |   auto act_options = input.options().requires_grad(false);
 42 |   torch::Tensor softmax_results = 
 43 |       torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
 44 | 
 45 |   // Softmax Intermediate Result Ptr
 46 |   void* input_ptr = static_cast<void*>(input.data_ptr());
 47 |   void* mask_ptr = static_cast<void*>(mask.data_ptr());
 48 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
 49 | 
 50 |   DISPATCH_HALF_AND_BFLOAT(
 51 |       input.scalar_type(),
 52 |       "dispatch_scaled_masked_softmax_forward",
 53 |       dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>(
 54 |       reinterpret_cast<scalar_t*>(softmax_results_ptr),
 55 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
 56 | 	  reinterpret_cast<const uint8_t*>(mask_ptr),
 57 | 	  scale_factor,
 58 | 	  query_seq_len,
 59 | 	  key_seq_len,
 60 | 	  batches,
 61 | 	  attn_heads,
 62 | 	  pad_batches);
 63 |       );
 64 |   return softmax_results;
 65 | }
 66 | 
 67 | torch::Tensor bwd_cuda(
 68 |     torch::Tensor const& output_grads_, 
 69 |     torch::Tensor const& softmax_results_, 
 70 |     float scale_factor)  {
 71 | 	
 72 |   auto output_grads = output_grads_.contiguous();
 73 |   auto softmax_results = softmax_results_.contiguous();
 74 | 
 75 |   //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
 76 |   const int batches = output_grads.size(0);
 77 |   const int attn_heads = output_grads.size(1);
 78 |   const int query_seq_len = output_grads.size(2);
 79 |   const int key_seq_len = output_grads.size(3);
 80 | 
 81 |   auto act_options = output_grads.options().requires_grad(false);
 82 |   torch::Tensor input_grads = 
 83 |             torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);  
 84 | 
 85 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
 86 |   void* input_grads_ptr = static_cast<void*>(input_grads.data_ptr());
 87 | 
 88 |   //Softmax Grad
 89 |   DISPATCH_HALF_AND_BFLOAT(
 90 |       output_grads_.scalar_type(),
 91 |       "dispatch_scaled_masked_softmax_backward",
 92 |       dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
 93 |       reinterpret_cast<scalar_t*>(input_grads_ptr), 
 94 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
 95 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
 96 | 	  scale_factor,
 97 | 	  query_seq_len,
 98 | 	  key_seq_len,
 99 | 	  batches,
100 | 	  attn_heads);
101 |       );
102 |   
103 |   return input_grads;
104 | }
105 | }
106 | }
107 | }
108 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     float scale_factor);
14 | 
15 | torch::Tensor bwd_cuda(
16 |     torch::Tensor const& output_grads, 
17 |     torch::Tensor const& softmax_results,
18 |     float scale_factor);
19 | 
20 | torch::Tensor fwd(
21 |     torch::Tensor const& input,
22 |     float scale_factor) {
23 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
24 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
25 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
26 |       "Only fp16 and bf16 are supported");
27 | 
28 |   return fwd_cuda(input, scale_factor);
29 | }
30 | 
31 | torch::Tensor bwd(
32 |     torch::Tensor const& output_grads, 
33 |     torch::Tensor const& softmax_results,
34 |     float scale_factor) {
35 | 
36 |   AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
37 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
38 | 
39 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
40 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
41 |       "Only fp16 and bf16 are supported");
42 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
43 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
44 |       "Only fp16 and bf16 are supported");
45 | 
46 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
47 | }
48 | 
49 | } // end namespace scaled_softmax
50 | } // end namespace fused_softmax
51 | } // end namespace multihead_attn
52 | 
53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
54 |   m.def("forward", 
55 |         &multihead_attn::fused_softmax::scaled_softmax::fwd, 
56 | 	"Self Multihead Attention scaled, softmax -- Forward.");
57 |   m.def("backward", 
58 |         &multihead_attn::fused_softmax::scaled_softmax::bwd,
59 | 	"Self Multihead Attention scaled, softmax -- Backward.");
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <ATen/ATen.h>
 4 | #include <cuda.h>
 5 | #include <cuda_runtime.h>
 6 | #include <cuda_fp16.h>
 7 | #include <cuda_profiler_api.h>
 8 | #include <ATen/cuda/CUDAContext.h>
 9 | #include <torch/extension.h>
10 | #include "scaled_masked_softmax.h"
11 | #include "type_shim.h"
12 | 
13 | namespace multihead_attn {
14 | namespace fused_softmax {
15 | namespace scaled_softmax {
16 | 
17 | torch::Tensor fwd_cuda(
18 |     torch::Tensor const& input,
19 |     float scale_factor)
20 | {
21 |   // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
22 |   const int batches = input.size(0);
23 |   const int attn_heads = input.size(1);
24 |   const int query_seq_len = input.size(2);
25 |   const int key_seq_len = input.size(3);
26 |   TORCH_INTERNAL_ASSERT(key_seq_len <= 4096);
27 |   TORCH_INTERNAL_ASSERT(query_seq_len > 1);
28 | 
29 |   // Output 
30 |   auto act_options = input.options().requires_grad(false);
31 |   torch::Tensor softmax_results = 
32 |       torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
33 | 
34 |   // Softmax Intermediate Result Ptr
35 |   void* input_ptr = static_cast<void*>(input.data_ptr());
36 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
37 | 
38 |   DISPATCH_HALF_AND_BFLOAT(
39 |       input.scalar_type(),
40 |       "dispatch_scaled_softmax_forward",
41 |       dispatch_scaled_softmax_forward<scalar_t, scalar_t, float>(
42 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
43 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
44 | 	  scale_factor,
45 | 	  query_seq_len,
46 | 	  key_seq_len,
47 | 	  batches,
48 | 	  attn_heads);
49 |       );
50 |   return softmax_results;
51 | }
52 | 
53 | torch::Tensor bwd_cuda(
54 |     torch::Tensor const& output_grads_, 
55 |     torch::Tensor const& softmax_results_, 
56 |     float scale_factor)  {
57 | 	
58 |   auto output_grads = output_grads_.contiguous();
59 |   auto softmax_results = softmax_results_.contiguous();
60 | 
61 |   //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
62 |   const int batches = output_grads.size(0);
63 |   const int attn_heads = output_grads.size(1);
64 |   const int query_seq_len = output_grads.size(2);
65 |   const int key_seq_len = output_grads.size(3);
66 | 
67 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
68 | 
69 |   //Softmax Grad
70 |   DISPATCH_HALF_AND_BFLOAT(
71 |       output_grads_.scalar_type(),
72 |       "dispatch_scaled_masked_softmax_backward",
73 |       dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
74 |           reinterpret_cast<scalar_t*>(output_grads_ptr), 
75 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
76 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
77 | 	  scale_factor,
78 | 	  query_seq_len,
79 | 	  key_seq_len,
80 | 	  batches,
81 | 	  attn_heads);
82 | 			   );
83 |   
84 |   //backward pass is completely in-place
85 |   return output_grads;
86 | }
87 | }
88 | }
89 | }
90 | 
91 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <torch/extension.h>
 5 | #include <vector>
 6 | 
 7 | namespace multihead_attn {
 8 | namespace fused_softmax {
 9 | namespace scaled_upper_triang_masked_softmax {
10 | 
11 | torch::Tensor fwd_cuda(
12 |     torch::Tensor const& input, 
13 |     float scale_factor);
14 | 
15 | torch::Tensor bwd_cuda(
16 |     torch::Tensor const& output_grads, 
17 |     torch::Tensor const& softmax_results,
18 |     float scale_factor);
19 | 
20 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
21 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
22 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
23 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
24 |       "Only fp16 and bf16 are supported");
25 | 
26 |   return fwd_cuda(input, scale_factor);
27 | }
28 | 
29 | torch::Tensor bwd(
30 |     torch::Tensor const& output_grads, 
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor) {
33 | 
34 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
35 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
36 | 
37 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
38 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
39 |       "Only fp16 and bf16 are supported");
40 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
41 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
42 |       "Only fp16 and bf16 are supported");
43 | 
44 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
45 | }
46 | 
47 | } // end namespace scaled_upper_triang_masked_softmax
48 | } // end namespace fused_softmax
49 | } // end namespace multihead_attn
50 | 
51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
52 |   m.def("forward", 
53 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
54 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
55 |   m.def("backward", 
56 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
57 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
58 | }
59 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
 2 | 
 3 | #include <ATen/ATen.h>
 4 | #include <cuda.h>
 5 | #include <cuda_runtime.h>
 6 | #include <cuda_fp16.h>
 7 | #include <cuda_profiler_api.h>
 8 | #include <ATen/cuda/CUDAContext.h>
 9 | #include <torch/extension.h>
10 | #include "scaled_upper_triang_masked_softmax.h"
11 | #include "type_shim.h"
12 | 
13 | namespace multihead_attn {
14 | namespace fused_softmax {
15 | namespace scaled_upper_triang_masked_softmax {
16 | 
17 | torch::Tensor fwd_cuda(
18 |     torch::Tensor const& input, 
19 |     float scale_factor)
20 | {
21 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
22 |   const int attn_batches = input.size(0);
23 |   const int seq_len = input.size(1);
24 |   TORCH_INTERNAL_ASSERT(seq_len <= 2048);
25 | 
26 |   // Output 
27 |   auto act_options = input.options().requires_grad(false);
28 |   torch::Tensor softmax_results = 
29 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
30 | 
31 |   // Softmax Intermediate Result Ptr
32 |   void* input_ptr = static_cast<void*>(input.data_ptr());
33 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
34 | 
35 |   DISPATCH_HALF_AND_BFLOAT(
36 |       input.scalar_type(),
37 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
38 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
39 | 	  reinterpret_cast<scalar_t*>(softmax_results_ptr),
40 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
41 | 	  scale_factor,
42 | 	  seq_len,
43 | 	  seq_len,
44 | 	  attn_batches);
45 |       );
46 |   return softmax_results;
47 | }
48 | 				      
49 | 
50 | torch::Tensor bwd_cuda(
51 |     torch::Tensor const& output_grads_, 
52 |     torch::Tensor const& softmax_results_, 
53 |     float scale_factor)  {
54 | 	
55 |   auto output_grads = output_grads_.contiguous();
56 |   auto softmax_results = softmax_results_.contiguous();
57 | 
58 |   //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
59 |   const int attn_batches = output_grads.size(0);
60 |   const int seq_len = output_grads.size(1);
61 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
62 | 
63 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
64 | 
65 |   //Softmax Grad
66 |   DISPATCH_HALF_AND_BFLOAT(
67 |       output_grads_.scalar_type(),
68 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
69 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
70 |           reinterpret_cast<scalar_t*>(output_grads_ptr), 
71 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
72 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
73 | 	  scale_factor,
74 | 	  seq_len,
75 | 	  seq_len,
76 | 	  attn_batches);
77 |       );
78 |   
79 |   //backward pass is completely in-place
80 |   return output_grads;
81 | }
82 | }
83 | }
84 | }
85 | 


--------------------------------------------------------------------------------
/megatron/fused_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/megatron/fused_kernels/tests/__init__.py


--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
  2 | 
  3 | 
  4 | #include <ATen/ATen.h>
  5 | #include "compat.h"
  6 | 
  7 | 
  8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)			\
  9 |   switch(TYPE)								\
 10 |     {									\
 11 |     case at::ScalarType::Half:						\
 12 |       {									\
 13 | 	using scalar_t = at::Half;					\
 14 | 	__VA_ARGS__;							\
 15 | 	break;								\
 16 |       }									\
 17 |     case at::ScalarType::BFloat16:					\
 18 |       {									\
 19 | 	using scalar_t = at::BFloat16;					\
 20 | 	__VA_ARGS__;							\
 21 | 	break;								\
 22 |       }									\
 23 |     default:								\
 24 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
 25 |       }
 26 | 
 27 | 
 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...)			\
 29 |   switch(TYPE)								\
 30 |     {									\
 31 |     case at::ScalarType::Half:						\
 32 |       {									\
 33 | 	using scalar_t = at::Half;					\
 34 | 	__VA_ARGS__;							\
 35 | 	break;								\
 36 |       }									\
 37 |     case at::ScalarType::BFloat16:					\
 38 |       {									\
 39 | 	using scalar_t = at::BFloat16;					\
 40 | 	__VA_ARGS__;							\
 41 | 	break;								\
 42 |       }									\
 43 |     case at::ScalarType::Float:						\
 44 |       {									\
 45 | 	using scalar_t = float;					\
 46 | 	__VA_ARGS__;							\
 47 | 	break;								\
 48 |       }									\
 49 |     default:								\
 50 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
 51 |       }
 52 | 
 53 | 
 54 | 
 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
 56 |   switch(TYPEIN)							\
 57 |     {									\
 58 |     case at::ScalarType::Float:						\
 59 |       {									\
 60 | 	using scalar_t_in = float;					\
 61 | 	switch(TYPEOUT)							\
 62 | 	  {								\
 63 | 	  case at::ScalarType::Float:					\
 64 | 	    {								\
 65 | 	      using scalar_t_out = float;				\
 66 | 	      __VA_ARGS__;						\
 67 | 	      break;							\
 68 | 	    }								\
 69 | 	  case at::ScalarType::Half:					\
 70 | 	    {								\
 71 | 	      using scalar_t_out = at::Half;				\
 72 | 	      __VA_ARGS__;						\
 73 | 	      break;							\
 74 | 	    }								\
 75 | 	  case at::ScalarType::BFloat16:				\
 76 | 	    {								\
 77 | 	      using scalar_t_out = at::BFloat16;			\
 78 | 	      __VA_ARGS__;						\
 79 | 	      break;							\
 80 | 	    }								\
 81 | 	  default:							\
 82 | 	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
 83 | 	  }								\
 84 | 	break;								\
 85 |       }									\
 86 |     case at::ScalarType::Half:						\
 87 |       {									\
 88 | 	using scalar_t_in = at::Half;					\
 89 | 	using scalar_t_out = at::Half;					\
 90 | 	__VA_ARGS__;							\
 91 | 	break;								\
 92 |       }									\
 93 |     case at::ScalarType::BFloat16:					\
 94 |       {									\
 95 | 	using scalar_t_in = at::BFloat16;				\
 96 | 	using scalar_t_out = at::BFloat16;				\
 97 | 	__VA_ARGS__;							\
 98 | 	break;								\
 99 |       }									\
100 |     default:								\
101 |       AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
102 |     }
103 | 
104 | 


--------------------------------------------------------------------------------
/megatron/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
 4 | 
 5 | from .distributed import DistributedDataParallel
 6 | from .bert_model import BertModel
 7 | from .gpt_model import GPTModel
 8 | from .t5_model import T5Model
 9 | from .language_model import get_language_model
10 | from .module import Float16Module
11 | from .enums import ModelType
12 | 


--------------------------------------------------------------------------------
/megatron/model/enums.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import enum
 4 | 
 5 | class ModelType(enum.Enum):
 6 |     encoder_or_decoder = 1
 7 |     encoder_and_decoder = 2
 8 |     encoder_or_decoder_with_lbl = 3
 9 | 
10 | class LayerType(enum.Enum):
11 |     encoder = 1
12 |     decoder = 2
13 |  
14 | class AttnType(enum.Enum):
15 |     self_attn = 1
16 |     cross_attn = 2
17 | 
18 | class AttnMaskType(enum.Enum):
19 |     padding = 1
20 |     causal = 2
21 | 


--------------------------------------------------------------------------------
/megatron/model/fused_bias_gelu.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 7 | # 1/sqrt(2*pi)-> 0.3989423
 8 | # 1/sqrt(2)   -> 0.70710678
 9 | # sqrt(2/pi)  -> 0.79788456
10 | # this function is tanh approximation of gelu
11 | # actual gelu is:
12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
13 | 
14 | @torch.jit.script
15 | def bias_gelu(bias, y):
16 |     x = bias + y
17 |     return  x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
18 | 
19 | # gradient of tanh approximation of gelu
20 | # gradient of actual gelu is:
21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
22 | @torch.jit.script
23 | def bias_gelu_back(g, bias, y):
24 |     x = bias + y
25 |     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
26 |     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
27 |     ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
28 |     return ff*g
29 | 
30 | class GeLUFunction(torch.autograd.Function):
31 |     @staticmethod
32 |     # bias is an optional argument
33 |     def forward(ctx, input, bias):
34 |         ctx.save_for_backward(input, bias)
35 |         return bias_gelu(bias, input)
36 | 
37 |     @staticmethod
38 |     def backward(ctx, grad_output):
39 |         input, bias = ctx.saved_tensors
40 |         tmp = bias_gelu_back(grad_output, bias, input)
41 |         return tmp, tmp
42 | 
43 | bias_gelu_impl = GeLUFunction.apply
44 | 


--------------------------------------------------------------------------------
/megatron/model/megablocks_utils.py:
--------------------------------------------------------------------------------
 1 | """Adapter to expose MegaBlocks package, if available."""
 2 | try:
 3 |     import megablocks
 4 | except ImportError:
 5 |     megablocks = None
 6 | 
 7 | def megablocks_is_available():
 8 |     return megablocks is not None
 9 | 
10 | def assert_megablocks_is_available():
11 |     assert megablocks_is_available(), (
12 |         'MegaBlocks not available. Please run `pip install megablocks`.')
13 | 
14 | def param_is_expert_model_parallel(param):
15 |     if megablocks_is_available():
16 |         return megablocks.layers.mpu.param_is_expert_model_parallel(param)
17 |     return False
18 | 
19 | def copy_expert_model_parallel_attributes(destination_tensor, source_tensor):
20 |     if not megablocks_is_available():
21 |         return
22 |     megablocks.layers.mpu.copy_expert_model_parallel_attributes(
23 |         destination_tensor, source_tensor)
24 | 
25 | moe = megablocks.layers.moe if megablocks_is_available() else None
26 | dmoe = megablocks.layers.dmoe if megablocks_is_available() else None
27 | arguments = megablocks.layers.arguments if megablocks_is_available() else None
28 | 


--------------------------------------------------------------------------------
/megatron/model/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Utilities for models."""
 4 | 
 5 | import math
 6 | 
 7 | import torch
 8 | 
 9 | from megatron import get_args
10 | 
11 | def init_method_normal(sigma):
12 |     """Init method based on N(0, sigma)."""
13 |     def init_(tensor):
14 |         return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
15 | 
16 |     return init_
17 | 
18 | 
19 | def scaled_init_method_normal(sigma, num_layers):
20 |     """Init method based on N(0, sigma/sqrt(2*num_layers)."""
21 |     std = sigma / math.sqrt(2.0 * num_layers)
22 | 
23 |     def init_(tensor):
24 |         return torch.nn.init.normal_(tensor, mean=0.0, std=std)
25 | 
26 |     return init_
27 | 
28 | 
29 | def attention_mask_func(attention_scores, attention_mask):
30 |     attention_scores.masked_fill_(attention_mask, -10000.0)
31 |     return attention_scores
32 | 
33 | 
34 | def get_linear_layer(rows, columns, init_method):
35 |     """Simple linear layer with weight initialization."""
36 |     layer = torch.nn.Linear(rows, columns)
37 |     if get_args().perform_initialization:
38 |         init_method(layer.weight)
39 |     with torch.no_grad():
40 |         layer.bias.zero_()
41 |     return layer
42 | 
43 | @torch.jit.script
44 | def gelu_impl(x):
45 |     """OpenAI's gelu implementation."""
46 |     return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
47 |                                        (1.0 + 0.044715 * x * x)))
48 | def openai_gelu(x):
49 |     return gelu_impl(x)
50 | 
51 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
52 | @torch.jit.script
53 | def erf_gelu(x):
54 |     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
55 | 


--------------------------------------------------------------------------------
/megatron/model/vision/classification.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Vision Transformer(VIT) model."""
 4 | 
 5 | import torch
 6 | from torch.nn.init import trunc_normal_
 7 | from megatron import get_args
 8 | from megatron.model.utils import get_linear_layer
 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead
10 | from megatron.model.vision.mit_backbone import mit_b3_avg
11 | from megatron.model.module import MegatronModule
12 | 
13 | class VitClassificationModel(MegatronModule):
14 |     """Vision Transformer Model."""
15 | 
16 |     def __init__(self, num_classes, finetune=False,
17 |                  pre_process=True, post_process=True):
18 |         super(VitClassificationModel, self).__init__()
19 |         args = get_args()
20 | 
21 |         self.hidden_size = args.hidden_size
22 |         self.num_classes = num_classes
23 |         self.finetune = finetune
24 |         self.pre_process = pre_process
25 |         self.post_process = post_process
26 |         self.backbone = VitBackbone(
27 |             pre_process=self.pre_process,
28 |             post_process=self.post_process,
29 |             single_token_output=True
30 |         )
31 |         
32 |         if self.post_process:
33 |             if not self.finetune:
34 |                 self.head = VitMlpHead(self.hidden_size, self.num_classes)
35 |             else:
36 |                 self.head = get_linear_layer(
37 |                     self.hidden_size,
38 |                     self.num_classes,
39 |                     torch.nn.init.zeros_
40 |                 )
41 | 
42 |     def set_input_tensor(self, input_tensor):
43 |         """See megatron.model.transformer.set_input_tensor()"""
44 |         self.backbone.set_input_tensor(input_tensor)
45 | 
46 |     def forward(self, input):
47 |         hidden_states = self.backbone(input)
48 | 
49 |         if self.post_process:
50 |             hidden_states = self.head(hidden_states)
51 | 
52 |         return hidden_states
53 | 
54 | 
55 | class MitClassificationModel(MegatronModule):
56 |     """Mix vision Transformer Model."""
57 | 
58 |     def __init__(self, num_classes,
59 |                  pre_process=True, post_process=True):
60 |         super(MitClassificationModel, self).__init__()
61 |         args = get_args()
62 | 
63 |         self.hidden_size = args.hidden_size
64 |         self.num_classes = num_classes
65 | 
66 |         self.backbone = mit_b3_avg()
67 |         self.head = torch.nn.Linear(512, num_classes)
68 |         self.apply(self._init_weights)
69 | 
70 |     def _init_weights(self, m):
71 |         if isinstance(m, torch.nn.Linear):
72 |             trunc_normal_(m.weight, std=.02)
73 |             if isinstance(m, torch.nn.Linear) and m.bias is not None:
74 |                 torch.nn.init.constant_(m.bias, 0)
75 | 
76 |     def set_input_tensor(self, input_tensor):
77 |         """See megatron.model.transformer.set_input_tensor()"""
78 |         pass
79 | 
80 |     def forward(self, input):
81 |         hidden_states = self.backbone(input)
82 |         hidden_states = self.head(hidden_states)
83 | 
84 |         return hidden_states
85 | 


--------------------------------------------------------------------------------
/megatron/model/vision/utils.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | import torch
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | def resize(input,
 7 |            size=None,
 8 |            scale_factor=None,
 9 |            mode='nearest',
10 |            align_corners=None,
11 |            warning=True):
12 |     if warning:
13 |         if size is not None and align_corners:
14 |             input_h, input_w = tuple(int(x) for x in input.shape[2:])
15 |             output_h, output_w = tuple(int(x) for x in size)
16 |             if output_h > input_h or output_w > output_h:
17 |                 if ((output_h > 1 and output_w > 1 and input_h > 1
18 |                      and input_w > 1) and (output_h - 1) % (input_h - 1)
19 |                         and (output_w - 1) % (input_w - 1)):
20 |                     warnings.warn(
21 |                         f'When align_corners={align_corners}, '
22 |                         'the output would more aligned if '
23 |                         f'input size {(input_h, input_w)} is `x+1` and '
24 |                         f'out size {(output_h, output_w)} is `nx+1`')
25 |     if isinstance(size, torch.Size):
26 |         size = tuple(int(x) for x in size)
27 |     return F.interpolate(input, size, scale_factor, mode, align_corners)
28 | 


--------------------------------------------------------------------------------
/megatron/mpu/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/megatron/mpu/tests/__init__.py


--------------------------------------------------------------------------------
/megatron/mpu/tests/commons.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import argparse
 4 | import os
 5 | import random
 6 | import numpy
 7 | import torch
 8 | 
 9 | import mpu
10 | 
11 | 
12 | class IdentityLayer(torch.nn.Module):
13 |     def __init__(self, size, scale=1.0):
14 |         super(IdentityLayer, self).__init__()
15 |         self.weight = torch.nn.Parameter(scale * torch.randn(size))
16 | 
17 |     def forward(self):
18 |         return self.weight
19 | 
20 | 
21 | def set_random_seed(seed):
22 |     """Set random seed for reproducability."""
23 |     random.seed(seed)
24 |     numpy.random.seed(seed)
25 |     torch.manual_seed(seed)
26 |     mpu.model_parallel_cuda_manual_seed(seed)
27 | 
28 | 
29 | def initialize_distributed(backend='nccl'):
30 |     """Initialize torch.distributed."""
31 |     # Get local rank in case it is provided.
32 |     parser = argparse.ArgumentParser()
33 |     parser.add_argument('--local_rank', type=int, default=None,
34 |                         help='local rank passed from distributed launcher')
35 |     args = parser.parse_args()
36 |     local_rank = args.local_rank
37 | 
38 |     # Get rank and world size.
39 |     rank = int(os.getenv('RANK', '0'))
40 |     world_size = int(os.getenv("WORLD_SIZE", '1'))
41 | 
42 |     print('> initializing torch.distributed with local rank: {}, '
43 |           'rank: {}, world size: {}'.format(local_rank, rank, world_size))
44 | 
45 |     # Set the device id.
46 |     device = rank % torch.cuda.device_count()
47 |     if local_rank is not None:
48 |         device = local_rank
49 |     torch.cuda.set_device(device)
50 | 
51 |     # Call the init process.
52 |     init_method = 'tcp://'
53 |     master_ip = os.getenv('MASTER_ADDR', 'localhost')
54 |     master_port = os.getenv('MASTER_PORT', '6000')
55 |     init_method += master_ip + ':' + master_port
56 |     torch.distributed.init_process_group(
57 |         backend=backend,
58 |         world_size=world_size,
59 |         rank=rank,
60 |         init_method=init_method)
61 | 
62 | 
63 | def print_separator(message):
64 |     torch.distributed.barrier()
65 |     filler_len = (78 - len(message)) // 2
66 |     filler = '-' * filler_len
67 |     string = '\n' + filler + ' {} '.format(message) + filler
68 |     if torch.distributed.get_rank() == 0:
69 |         print(string, flush=True)
70 |     torch.distributed.barrier()
71 | 


--------------------------------------------------------------------------------
/megatron/mpu/tests/test_cross_entropy.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import set_random_seed
 4 | from commons import IdentityLayer
 5 | from commons import print_separator
 6 | from commons import initialize_distributed
 7 | from mpu.cross_entropy import vocab_parallel_cross_entropy
 8 | import mpu
 9 | import torch.nn.functional as F
10 | import torch
11 | import random
12 | import sys
13 | sys.path.append("../..")
14 | 
15 | 
16 | def torch_cross_entropy(batch_size, seq_length, vocab_size,
17 |                         logits_scale, seed):
18 |     set_random_seed(seed)
19 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
20 |                              scale=logits_scale).cuda()
21 |     logits = identity()
22 |     target = torch.cuda.LongTensor(
23 |         size=(batch_size, seq_length)).random_(0, vocab_size)
24 |     loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
25 |                            target.view(-1),
26 |                            reduction='none').view_as(target).mean()
27 |     loss.backward()
28 |     return loss, identity.weight.grad
29 | 
30 | 
31 | def mpu_cross_entropy(batch_size, seq_length, vocab_size,
32 |                       logits_scale, seed):
33 |     set_random_seed(seed)
34 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
35 |                              scale=logits_scale).cuda()
36 |     logits = identity()
37 |     logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
38 |     target = torch.cuda.LongTensor(
39 |         size=(batch_size, seq_length)).random_(0, vocab_size)
40 |     loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
41 |     loss.backward()
42 |     return loss, identity.weight.grad
43 | 
44 | 
45 | def test_cross_entropy(tensor_model_parallel_size):
46 | 
47 |     if torch.distributed.get_rank() == 0:
48 |         print('> testing cross entropy with model parallel size {} ...'.
49 |               format(tensor_model_parallel_size))
50 | 
51 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
52 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
53 | 
54 |     batch_size = 13
55 |     seq_length = 17
56 |     vocab_size_per_partition = 11
57 |     logits_scale = 1000.0
58 |     vocab_size = vocab_size_per_partition * tensor_model_parallel_size
59 |     seed = 1234
60 | 
61 |     loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
62 |                                                  vocab_size, logits_scale,
63 |                                                  seed)
64 |     loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
65 |                                            vocab_size, logits_scale,
66 |                                            seed)
67 | 
68 |     error = loss_torch.sub_(loss_mpu).abs().max()
69 |     print('   max error in loss on global rank {}: {}'.format(
70 |         torch.distributed.get_rank(), error))
71 |     assert error < 1.0e-6
72 | 
73 |     error = grad_torch.sub_(grad_mpu).abs().max()
74 |     print('   max error in grad on global rank {}: {}'.format(
75 |         torch.distributed.get_rank(), error))
76 |     assert error < 1.0e-6
77 | 
78 |     # Reset groups
79 |     mpu.destroy_tensor_model_parallel()
80 | 
81 |     torch.distributed.barrier()
82 |     if torch.distributed.get_rank() == 0:
83 |         print('>> passed the test :-)')
84 | 
85 | 
86 | if __name__ == '__main__':
87 | 
88 |     initialize_distributed()
89 |     world_size = torch.distributed.get_world_size()
90 | 
91 |     tensor_model_parallel_size = 1
92 |     while tensor_model_parallel_size <= world_size:
93 |         print_separator('test cross entropy')
94 |         test_cross_entropy(tensor_model_parallel_size)
95 |         tensor_model_parallel_size *= 2
96 | 


--------------------------------------------------------------------------------
/megatron/mpu/tests/test_data.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import print_separator
 4 | from commons import initialize_distributed
 5 | from mpu import data as data_utils
 6 | import mpu
 7 | import torch
 8 | import functools
 9 | import operator
10 | import sys
11 | sys.path.append("../..")
12 | 
13 | 
14 | def test_broadcast_data(tensor_model_parallel_size):
15 | 
16 |     if torch.distributed.get_rank() == 0:
17 |         print('> testing broadcast_data with model parallel size {} ...'.
18 |               format(tensor_model_parallel_size))
19 | 
20 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
21 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
22 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
23 | 
24 |     key_size_t = {'key1': [7, 11],
25 |                   'key2': [8, 2, 1],
26 |                   'key3': [13],
27 |                   'key4': [5, 1, 2],
28 |                   'key5': [5, 12]}
29 |     keys = list(key_size_t.keys())
30 | 
31 |     data = {}
32 |     data_t = {}
33 |     for key in key_size_t:
34 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
35 |         data_t[key] = data[key].clone()
36 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
37 |     data_t['keyX'] = data['keyX'].clone()
38 |     if mpu.get_tensor_model_parallel_rank() != 0:
39 |         data = None
40 | 
41 |     data_utils._check_data_types(keys, data_t, torch.int64)
42 |     key_size, key_numel, \
43 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
44 |     for key in keys:
45 |         assert key_size[key] == key_size_t[key]
46 |     total_numel_t = 0
47 |     for key in keys:
48 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
49 |         assert key_numel[key] == target_size
50 |         total_numel_t += target_size
51 |     assert total_numel == total_numel_t
52 | 
53 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
54 |     for key in keys:
55 |         tensor = data_t[key].cuda()
56 |         assert data_b[key].sub(tensor).abs().max() == 0
57 | 
58 |     # Reset groups
59 |     mpu.destroy_tensor_model_parallel()
60 | 
61 |     torch.distributed.barrier()
62 |     if torch.distributed.get_rank() == 0:
63 |         print('>> passed the test :-)')
64 | 
65 | 
66 | if __name__ == '__main__':
67 | 
68 |     initialize_distributed()
69 |     world_size = torch.distributed.get_world_size()
70 | 
71 |     tensor_model_parallel_size = 1
72 |     while tensor_model_parallel_size <= world_size:
73 |         print_separator('test test broadcast data')
74 |         test_broadcast_data(tensor_model_parallel_size)
75 |         tensor_model_parallel_size *= 2
76 | 


--------------------------------------------------------------------------------
/megatron/mpu/tests/test_initialize.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | from commons import print_separator
 4 | from commons import initialize_distributed
 5 | import mpu
 6 | import torch
 7 | import sys
 8 | sys.path.append("../..")
 9 | 
10 | 
11 | def test_initialize_model_parallel(tensor_model_parallel_size):
12 | 
13 |     if torch.distributed.get_rank() == 0:
14 |         print('> testing initialize_model_parallel with size {} ...'.format(
15 |             tensor_model_parallel_size))
16 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
17 |                                torch.distributed.get_world_size())
18 |     assert not mpu.model_parallel_is_initialized()
19 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
20 |     assert mpu.model_parallel_is_initialized()
21 | 
22 |     # Checks.
23 |     def check(group, world_size, rank):
24 |         assert world_size == torch.distributed.get_world_size(group=group)
25 |         assert rank == torch.distributed.get_rank(group=group)
26 | 
27 |     # Model parallel.
28 |     world_size = tensor_model_parallel_size_
29 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
30 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
31 |     assert rank == mpu.get_tensor_model_parallel_rank()
32 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
33 | 
34 |     # Data parallel.
35 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
36 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size
37 |     assert world_size == mpu.get_data_parallel_world_size()
38 |     assert rank == mpu.get_data_parallel_rank()
39 |     check(mpu.get_data_parallel_group(), world_size, rank)
40 | 
41 |     # Reset groups
42 |     mpu.destroy_model_parallel()
43 | 
44 |     torch.distributed.barrier()
45 |     if torch.distributed.get_rank() == 0:
46 |         print('>> passed the test :-)')
47 | 
48 | 
49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
50 | 
51 |     if torch.distributed.get_rank() == 0:
52 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
53 |             tensor_model_parallel_size_))
54 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
55 |                               torch.distributed.get_world_size())
56 |     assert not mpu.model_parallel_is_initialized()
57 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
58 |     assert mpu.model_parallel_is_initialized()
59 | 
60 |     # Checks
61 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
62 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
63 | 
64 |     # Reset groups
65 |     mpu.destroy_model_parallel()
66 | 
67 |     torch.distributed.barrier()
68 |     if torch.distributed.get_rank() == 0:
69 |         print('>> passed the test :-)')
70 | 
71 | 
72 | if __name__ == '__main__':
73 | 
74 |     initialize_distributed()
75 |     world_size = torch.distributed.get_world_size()
76 |     tensor_model_parallel_size = 1
77 |     while tensor_model_parallel_size <= world_size:
78 |         print_separator('test initialize model parallel')
79 |         test_initialize_model_parallel(tensor_model_parallel_size)
80 |         print_separator('test model parallel source rank')
81 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
82 |         tensor_model_parallel_size *= 2
83 | 


--------------------------------------------------------------------------------
/megatron/static/index.html:
--------------------------------------------------------------------------------
  1 | <!-- coding=utf-8-->
  2 | <!-- Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.-->
  3 | 
  4 | <!DOCTYPE html>
  5 | <html lang="en">
  6 | <head>
  7 | <meta charset="utf-8" />
  8 | <title>Megatron</title>
  9 | <style>
 10 | 
 11 | .wrapper {
 12 | max-width: 75%;
 13 | margin: auto;
 14 | }
 15 | 
 16 | h1 { 
 17 | margin: 3rem 0 1rem 0; 
 18 | padding: 0;
 19 | font-size: 1.5rem;
 20 | }
 21 | 
 22 | textarea {
 23 | width: 100%;
 24 | min-height: 300px;
 25 | resize: none;
 26 | border-radius: 8px;
 27 | border: 1px solid #ddd;
 28 | padding: 0.5rem;
 29 | box-shadow: inset 0 0 0.25rem #ddd;
 30 | &:focus {
 31 | outline: none;
 32 | border: 1px solid darken(#ddd, 5%);
 33 | box-shadow: inset 0 0 0.5rem darken(#ddd, 5%);
 34 | }
 35 | }
 36 | 
 37 | #the-count {
 38 | float: right;
 39 | padding: 0.1rem 0 0 0;
 40 | font-size: 0.875rem;
 41 | }
 42 | /* Chat containers */
 43 | .container {
 44 | font-family: 'Arial', sans-serif;
 45 | font-size: 16px;
 46 | border: 2px solid #dedede;
 47 | background-color: #f1f1f1;
 48 | border-radius: 5px;
 49 | padding: 15px;
 50 | margin: 10px 0;
 51 | }
 52 | 
 53 | 
 54 | /* Clear floats */
 55 | .container::after {
 56 | content: "";
 57 | clear: both;
 58 | display: table;
 59 | }
 60 | 
 61 | /* Style images */
 62 | .container img {
 63 | float: left;
 64 | max-width: 60px;
 65 | width: 100%;
 66 | margin-right: 20px;
 67 | border-radius: 50%;
 68 | }
 69 | 
 70 | </style>
 71 | </head>
 72 | <body>
 73 | <div class="wrapper">
 74 | <h1>Prompt Megatron</h1>
 75 | <textarea name="prompt" id="prompt" maxlength="1024" placeholder="Add prompt"autofocus></textarea>
 76 | <label for="tokens_to_generate">Number tokens to generate (1-1024):</label>
 77 | <input type="number" id="tokens_to_generate" name="tokens_to_generate" min="10" max="256", value=32>
 78 | <button onclick="submit_query()">Submit</button>
 79 | 
 80 | <div id="the-count">
 81 | <span id="current">0</span>
 82 | <span id="maximum">/ 1000</span>
 83 | </div>
 84 | <textarea name="response" id="response" maxlength="2048" placeholder="Megatron response..."></textarea>
 85 | </div>
 86 | <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
 87 | <script type="text/javascript">
 88 | 
 89 | 	function submit_query() {
 90 | 		$("#response").val("Waiting for Megatron response...");
 91 | 		$.ajax({
 92 | 			url:"api",
 93 | 			type:"PUT",
 94 | 			data:JSON.stringify({prompts: [$("#prompt").val()], tokens_to_generate: parseInt($("#tokens_to_generate").val(),10)}),
 95 | 			contentType:"application/json; charset=utf-8",
 96 | 			dataType:"json",
 97 | 			success: function(data){
 98 | 				data.max_len=35;
 99 | 				$("#response").val(data.text);
100 | 			}
101 | 		});
102 | 	}
103 | 	
104 | $('textarea').keyup(function() {
105 | var characterCount = $(this).val().length,
106 | current = $('#current'),
107 | maximum = $('#maximum'),
108 | theCount = $('#the-count');
109 | 
110 | current.text(characterCount);
111 | 
112 | if (characterCount >= 800) {
113 | maximum.css('color', '#8f0001');
114 | current.css('color', '#8f0001');
115 | theCount.css('font-weight','bold');
116 | } else {
117 | maximum.css('color','#666');
118 | theCount.css('font-weight','normal');
119 | }
120 | });
121 | </script>
122 | </body>
123 | </html>
124 | 
125 | 


--------------------------------------------------------------------------------
/megatron/text_generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .api import (
5 |     generate,
6 |     generate_and_post_process,
7 |     beam_search_and_post_process)
8 | 


--------------------------------------------------------------------------------
/megatron/text_generation/beam_utils.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | ## from huggingface beam search
19 | class BeamHypotheses(object):
20 |     def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
21 |         """
22 |         Initialize n-best list of hypotheses.
23 |         """
24 |         self.length_penalty = length_penalty
25 |         self.early_stopping = early_stopping
26 |         self.num_beams = num_beams
27 |         self.beams = []
28 |         self.worst_score = 1e9
29 | 
30 |     def __len__(self):
31 |         """
32 |         Number of hypotheses in the list.
33 |         """
34 |         return len(self.beams)
35 | 
36 |     def add(self, hyp, sum_logprobs, length):
37 |         """
38 |         Add a new hypothesis to the list.
39 |         """
40 |         score = sum_logprobs / length ** self.length_penalty
41 |         if len(self) < self.num_beams or score > self.worst_score:
42 |             self.beams.append((score, hyp))
43 |             if len(self) > self.num_beams:
44 |                 sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
45 |                 del self.beams[sorted_scores[0][1]]
46 |                 self.worst_score = sorted_scores[1][0]
47 |             else:
48 |                 self.worst_score = min(score, self.worst_score)
49 | 
50 |     def is_done(self, best_sum_logprobs, cur_len):
51 |         """
52 |         If there are enough hypotheses and that none of the hypotheses being generated
53 |         can become better than the worst one in the heap, then we are done with this sentence.
54 |         """
55 | 
56 |         if len(self) < self.num_beams:
57 |             return False
58 |         elif self.early_stopping:
59 |             return True
60 |         else:
61 |             cur_score = best_sum_logprobs / cur_len ** self.length_penalty
62 |             ret = self.worst_score >= cur_score
63 |             return ret
64 | 
65 | 


--------------------------------------------------------------------------------
/megatron/text_generation/sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Sampling utilities.
 4 | Part of this code is inspired by:
 5 |  - https://github.com/ari-holtzman/degen/blob/master/gen.py
 6 |  - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
 7 | """
 8 | 
 9 | 
10 | import torch
11 | 
12 | 
13 | 
14 | def modify_logits_for_top_k_filtering(logits, top_k):
15 |     """Set the logits for none top-k values to -inf."""
16 | 
17 |     filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
18 |     logits.masked_fill_(filter_, float('-Inf'))
19 | 
20 | 
21 | 
22 | def modify_logits_for_top_p_filtering(logits, top_p):
23 |     """Set the logits for none top-p values to -inf."""
24 | 
25 |     # First sort and calculate cumulative sum of probabilities.
26 |     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
27 |     cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
28 | 
29 |     # Filteration based on the cumulative sum.
30 |     filter_ = cumulative_probs > top_p
31 |     # This shift by 1 is weird and I cannot justify it. This existed
32 |     # in the original implementation:
33 |     #   https://github.com/ari-holtzman/degen/blob/master/gen.py
34 |     # and I guess it is needed so keeping it for now.
35 |     filter_[:, 1:] = filter_[:, :-1].clone()
36 |     # Make sure we at least have one token to select from.
37 |     filter_[..., 0] = 0
38 | 
39 |     # Fill in the filtered part
40 |     filter_ = filter_.scatter(1, sorted_indices, filter_)
41 |     logits.masked_fill_(filter_, float('-Inf'))
42 | 
43 | 
44 | 
45 | def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
46 |     """ Sample and generate a token.
47 |     Note: logits has the dimension [b, v] where b is the batch size
48 |           and v is the vocabulary size.
49 |     If vocab_size is provided, we will make sure the sample that is
50 |     generated is in [0, vocab-size). This will avoid out of vocabulary
51 |     generations due to padding.
52 |     """
53 | 
54 |     # Check logits for consistency.
55 |     assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
56 |     assert logits.type() == 'torch.cuda.FloatTensor', \
57 |         'input logits should be floats.'
58 | 
59 | 
60 |     # Greedy is just simple argmax.
61 |     if top_k == 1:
62 |         assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
63 |         samples = torch.argmax(logits, dim=-1)
64 | 
65 |     # Top-k or top-p sampling.
66 |     else:
67 |         # Clone so we do not modify the inputs,
68 |         logits = logits.clone()
69 |         # Apply temperature in place.
70 |         if temperature != 1.0:
71 |             logits.div_(temperature)
72 | 
73 |         if top_k > 1:
74 |             assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
75 |             assert top_k <= logits.size(1), 'top-k is larger than logit size.'
76 |             if vocab_size:
77 |                 assert top_k < vocab_size, 'top-k is larger than vocab size.'
78 |             modify_logits_for_top_k_filtering(logits, top_k)
79 | 
80 |         elif top_p > 0.0:
81 |             assert top_p <= 1.0, 'top-p should be in (0, 1].'
82 |             modify_logits_for_top_p_filtering(logits, top_p)
83 | 
84 |         # After filtering, we need to recalculate the distribution.
85 |         probs = logits.softmax(dim=-1)
86 |         samples = torch.multinomial(probs, num_samples=1).view(-1)
87 | 
88 |     # If vocab size is provided, make sure the samples are in
89 |     # in the range [0, vocab-size).
90 |     if vocab_size:
91 |         samples = torch.clamp(samples, min=0, max=(vocab_size - 1))
92 | 
93 |     return samples
94 | 


--------------------------------------------------------------------------------
/megatron/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | 
4 | from .tokenizer import build_tokenizer
5 | 


--------------------------------------------------------------------------------
/pretrain_vision_classify.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
  2 | 
  3 | """Pretrain VIT"""
  4 | 
  5 | import torch
  6 | import torch.nn.functional as F
  7 | from functools import partial
  8 | from megatron import get_args, get_timers, print_rank_0
  9 | from megatron.data.vit_dataset import build_train_valid_datasets
 10 | from megatron.model import ModelType
 11 | from megatron.model.vision.classification import VitClassificationModel
 12 | from megatron.model.vision.classification import MitClassificationModel
 13 | from megatron.training import pretrain
 14 | from megatron.utils import average_losses_across_data_parallel_group
 15 | 
 16 | 
 17 | def model_provider(pre_process=True, post_process=True):
 18 |     """Build the model."""
 19 | 
 20 |     args = get_args()
 21 | 
 22 |     if args.vision_backbone_type == 'vit':
 23 |         print_rank_0("building VIT model ...")
 24 |         model = VitClassificationModel(num_classes=args.num_classes,
 25 |                                        pre_process=pre_process,
 26 |                                        post_process=post_process)
 27 |     elif args.vision_backbone_type == 'mit':
 28 |         print_rank_0("building MIT model ...")
 29 |         model = MitClassificationModel(num_classes=args.num_classes,
 30 |                                        pre_process=pre_process,
 31 |                                        post_process=post_process)
 32 |     else:
 33 |         raise Exception('{} vision backbone is not supported.'.format(
 34 |                               args.vision_backbone_type))
 35 |     return model
 36 | 
 37 | 
 38 | def get_batch(data_iterator):
 39 |     """Build the batch."""
 40 |     data = next(data_iterator)
 41 | 
 42 |     # only data parallelism; no need for broadcast
 43 |     images = data[0].cuda()
 44 |     labels = data[1].cuda()
 45 | 
 46 |     return images, labels
 47 | 
 48 | 
 49 | def loss_func(labels, output_tensor):
 50 |     logits = output_tensor.contiguous().float()
 51 |     loss = F.cross_entropy(logits, labels)
 52 | 
 53 |     outputs = torch.argmax(logits, -1)
 54 |     correct = (outputs == labels).float()
 55 |     accuracy = torch.mean(correct)
 56 | 
 57 |     averaged_loss = average_losses_across_data_parallel_group([loss, accuracy])
 58 | 
 59 |     return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}
 60 | 
 61 | 
 62 | def forward_step(data_iterator, model):
 63 |     """Forward step."""
 64 |     timers = get_timers()
 65 | 
 66 |     # Get the batch.
 67 |     timers("batch-generator", log_level=2).start()
 68 |     (
 69 |         images,
 70 |         labels,
 71 |     ) = get_batch(data_iterator)
 72 |     timers("batch-generator").stop()
 73 | 
 74 |     # Forward model. lm_labels
 75 |     output_tensor = model(images)
 76 | 
 77 |     return output_tensor, partial(loss_func, labels)
 78 | 
 79 | def train_valid_test_datasets_provider(train_val_test_num_samples):
 80 |     """Build train, valid, and test datasets."""
 81 |     args = get_args()
 82 | 
 83 |     print_rank_0(
 84 |         "> building train, validation, and test datasets " "for VIT ..."
 85 |     )
 86 |     train_ds, valid_ds = build_train_valid_datasets(
 87 |         data_path=args.data_path,
 88 |         image_size=(args.img_h, args.img_w)
 89 |     )
 90 |     print_rank_0("> finished creating VIT datasets ...")
 91 | 
 92 |     return train_ds, valid_ds, None
 93 | 
 94 | 
 95 | if __name__ == "__main__":
 96 | 
 97 |     pretrain(
 98 |         train_valid_test_datasets_provider,
 99 |         model_provider,
100 |         ModelType.encoder_or_decoder,
101 |         forward_step,
102 |         args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
103 |     )
104 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name="megatron.core",
 5 |     version="0.1",
 6 |     description="Core components of Megatron.",
 7 |     packages=find_packages(
 8 |         include=("megatron.core")
 9 |     )
10 | )
11 | 


--------------------------------------------------------------------------------
/tasks/data_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 | 
  3 | """ Tasks data utility."""
  4 | 
  5 | import re
  6 | import numpy as np
  7 | 
  8 | 
  9 | def clean_text(text):
 10 |     """Remove new lines and multiple spaces and adjust end of sentence dot."""
 11 | 
 12 |     text = text.replace("\n", " ")
 13 |     text = re.sub(r'\s+', ' ', text)
 14 |     for _ in range(3):
 15 |         text = text.replace(' . ', '. ')
 16 | 
 17 |     return text
 18 | 
 19 | 
 20 | def build_sample(ids, types, paddings, label, unique_id):
 21 |     """Convert to numpy and return a sample consumed by the batch producer."""
 22 | 
 23 |     ids_np = np.array(ids, dtype=np.int64)
 24 |     types_np = np.array(types, dtype=np.int64)
 25 |     paddings_np = np.array(paddings, dtype=np.int64)
 26 |     sample = ({'text': ids_np,
 27 |                'types': types_np,
 28 |                'padding_mask': paddings_np,
 29 |                'label': int(label),
 30 |                'uid': int(unique_id)})
 31 | 
 32 |     return sample
 33 | 
 34 | 
 35 | def build_tokens_types_paddings_from_text(text_a, text_b,
 36 |                                           tokenizer, max_seq_length):
 37 |     """Build token types and paddings, trim if needed, and pad if needed."""
 38 | 
 39 |     text_a_ids = tokenizer.tokenize(text_a)
 40 |     text_b_ids = None
 41 |     if text_b is not None:
 42 |         text_b_ids = tokenizer.tokenize(text_b)
 43 | 
 44 |     return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
 45 |                                                 max_seq_length, tokenizer.cls,
 46 |                                                 tokenizer.sep, tokenizer.pad)
 47 | 
 48 | 
 49 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
 50 |                                          cls_id, sep_id, pad_id):
 51 |     """Build token types and paddings, trim if needed, and pad if needed."""
 52 | 
 53 |     ids = []
 54 |     types = []
 55 |     paddings = []
 56 | 
 57 |     # [CLS].
 58 |     ids.append(cls_id)
 59 |     types.append(0)
 60 |     paddings.append(1)
 61 | 
 62 |     # A.
 63 |     len_text_a = len(text_a_ids)
 64 |     ids.extend(text_a_ids)
 65 |     types.extend([0] * len_text_a)
 66 |     paddings.extend([1] * len_text_a)
 67 | 
 68 |     # [SEP].
 69 |     ids.append(sep_id)
 70 |     types.append(0)
 71 |     paddings.append(1)
 72 | 
 73 |     # B.
 74 |     if text_b_ids is not None:
 75 |         len_text_b = len(text_b_ids)
 76 |         ids.extend(text_b_ids)
 77 |         types.extend([1] * len_text_b)
 78 |         paddings.extend([1] * len_text_b)
 79 | 
 80 |     # Cap the size.
 81 |     trimmed = False
 82 |     if len(ids) >= max_seq_length:
 83 |         max_seq_length_m1 = max_seq_length - 1
 84 |         ids = ids[0:max_seq_length_m1]
 85 |         types = types[0:max_seq_length_m1]
 86 |         paddings = paddings[0:max_seq_length_m1]
 87 |         trimmed = True
 88 | 
 89 |     # [SEP].
 90 |     if (text_b_ids is not None) or trimmed:
 91 |         ids.append(sep_id)
 92 |         if text_b_ids is None:
 93 |             types.append(0)
 94 |         else:
 95 |             types.append(1)
 96 |         paddings.append(1)
 97 | 
 98 |     # Padding.
 99 |     padding_length = max_seq_length - len(ids)
100 |     if padding_length > 0:
101 |         ids.extend([pad_id] * padding_length)
102 |         types.extend([pad_id] * padding_length)
103 |         paddings.extend([0] * padding_length)
104 | 
105 |     return ids, types, paddings
106 | 


--------------------------------------------------------------------------------
/tasks/glue/data.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """GLUE dataset."""
 4 | 
 5 | from abc import ABC
 6 | from abc import abstractmethod
 7 | 
 8 | from torch.utils.data import Dataset
 9 | 
10 | from megatron import print_rank_0
11 | from tasks.data_utils import build_sample
12 | from tasks.data_utils import build_tokens_types_paddings_from_text
13 | 
14 | 
15 | class GLUEAbstractDataset(ABC, Dataset):
16 |     """GLUE base dataset class."""
17 | 
18 |     def __init__(self, task_name, dataset_name, datapaths,
19 |                  tokenizer, max_seq_length):
20 |         # Store inputs.
21 |         self.task_name = task_name
22 |         self.dataset_name = dataset_name
23 |         self.tokenizer = tokenizer
24 |         self.max_seq_length = max_seq_length
25 |         print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
26 |                                                              self.dataset_name))
27 |         # Process the files.
28 |         string = '  > paths:'
29 |         for path in datapaths:
30 |             string += ' ' + path
31 |         print_rank_0(string)
32 |         self.samples = []
33 |         for datapath in datapaths:
34 |             self.samples.extend(self.process_samples_from_single_path(datapath))
35 |         print_rank_0('  >> total number of samples: {}'.format(
36 |             len(self.samples)))
37 | 
38 |     def __len__(self):
39 |         return len(self.samples)
40 | 
41 |     def __getitem__(self, idx):
42 |         raw_sample = self.samples[idx]
43 |         ids, types, paddings = build_tokens_types_paddings_from_text(
44 |             raw_sample['text_a'], raw_sample['text_b'],
45 |             self.tokenizer, self.max_seq_length)
46 |         sample = build_sample(ids, types, paddings,
47 |                               raw_sample['label'], raw_sample['uid'])
48 |         return sample
49 | 
50 |     @abstractmethod
51 |     def process_samples_from_single_path(self, datapath):
52 |         """Abstract method that takes a single path / filename and
53 |         returns a list of dataset samples, each sample being a dict of
54 |             {'text_a': string, 'text_b': string, 'label': int, 'uid': int}
55 |         """
56 |         pass
57 | 


--------------------------------------------------------------------------------
/tasks/glue/finetune.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """GLUE finetuning/evaluation."""
 4 | 
 5 | from megatron import get_args
 6 | from megatron import print_rank_0
 7 | from megatron import get_tokenizer
 8 | from megatron.model.classification import Classification
 9 | from tasks.eval_utils import accuracy_func_provider
10 | from tasks.finetune_utils import finetune
11 | 
12 | 
13 | def glue_classification(num_classes, Dataset,
14 |                         name_from_datapath_func):
15 | 
16 |     def train_valid_datasets_provider():
17 |         """Build train and validation dataset."""
18 |         args = get_args()
19 |         tokenizer = get_tokenizer()
20 | 
21 |         train_dataset = Dataset('training', args.train_data,
22 |                                 tokenizer, args.seq_length)
23 |         valid_dataset = Dataset('validation', args.valid_data,
24 |                                 tokenizer, args.seq_length)
25 | 
26 |         return train_dataset, valid_dataset
27 | 
28 |     def model_provider(pre_process=True, post_process=True):
29 |         """Build the model."""
30 |         args = get_args()
31 | 
32 |         print_rank_0('building classification model for {} ...'.format(
33 |             args.task))
34 |         model = Classification(num_classes=num_classes, num_tokentypes=2,
35 |                                pre_process=pre_process, post_process=post_process)
36 | 
37 |         return model
38 | 
39 |     def metrics_func_provider():
40 |         """Privde metrics callback function."""
41 |         def single_dataset_provider(datapath):
42 |             args = get_args()
43 |             tokenizer = get_tokenizer()
44 | 
45 |             name = name_from_datapath_func(datapath)
46 |             return Dataset(name, [datapath], tokenizer, args.seq_length)
47 |         return accuracy_func_provider(single_dataset_provider)
48 | 
49 |     """Finetune/evaluate."""
50 |     finetune(train_valid_datasets_provider, model_provider,
51 |              end_of_epoch_callback_provider=metrics_func_provider)
52 | 
53 | 
54 | def main():
55 |     args = get_args()
56 | 
57 |     if args.task == 'MNLI':
58 | 
59 |         num_classes = 3
60 |         from tasks.glue.mnli import MNLIDataset as Dataset
61 | 
62 |         def name_from_datapath(datapath):
63 |             return datapath.split('MNLI')[-1].strip(
64 |                 '.tsv').strip('/').replace('_', '-')
65 | 
66 |     elif args.task == 'QQP':
67 | 
68 |         num_classes = 2
69 |         from tasks.glue.qqp import QQPDataset as Dataset
70 | 
71 |         def name_from_datapath(datapath):
72 |             return datapath.split('QQP')[-1].strip(
73 |                 '.tsv').strip('/').replace('_', '-')
74 | 
75 |     else:
76 |         raise NotImplementedError('GLUE task {} is not implemented.'.format(
77 |             args.task))
78 | 
79 |     glue_classification(num_classes, Dataset, name_from_datapath)
80 | 


--------------------------------------------------------------------------------
/tasks/glue/mnli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """MNLI dataset."""
 4 | 
 5 | from megatron import print_rank_0
 6 | from tasks.data_utils import clean_text
 7 | from .data import GLUEAbstractDataset
 8 | 
 9 | 
10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2}
11 | 
12 | 
13 | class MNLIDataset(GLUEAbstractDataset):
14 | 
15 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
16 |                  test_label='contradiction'):
17 |         self.test_label = test_label
18 |         super().__init__('MNLI', name, datapaths,
19 |                          tokenizer, max_seq_length)
20 | 
21 |     def process_samples_from_single_path(self, filename):
22 |         """"Implement abstract method."""
23 |         print_rank_0(' > Processing {} ...'.format(filename))
24 | 
25 |         samples = []
26 |         total = 0
27 |         first = True
28 |         is_test = False
29 |         with open(filename, 'r') as f:
30 |             for line in f:
31 |                 row = line.strip().split('\t')
32 |                 if first:
33 |                     first = False
34 |                     if len(row) == 10:
35 |                         is_test = True
36 |                         print_rank_0(
37 |                             '   reading {}, {} and {} columns and setting '
38 |                             'labels to {}'.format(
39 |                                 row[0].strip(), row[8].strip(),
40 |                                 row[9].strip(), self.test_label))
41 |                     else:
42 |                         print_rank_0('    reading {} , {}, {}, and {} columns '
43 |                                      '...'.format(
44 |                                          row[0].strip(), row[8].strip(),
45 |                                          row[9].strip(), row[-1].strip()))
46 |                     continue
47 | 
48 |                 text_a = clean_text(row[8].strip())
49 |                 text_b = clean_text(row[9].strip())
50 |                 unique_id = int(row[0].strip())
51 |                 label = row[-1].strip()
52 |                 if is_test:
53 |                     label = self.test_label
54 | 
55 |                 assert len(text_a) > 0
56 |                 assert len(text_b) > 0
57 |                 assert label in LABELS
58 |                 assert unique_id >= 0
59 | 
60 |                 sample = {'text_a': text_a,
61 |                           'text_b': text_b,
62 |                           'label': LABELS[label],
63 |                           'uid': unique_id}
64 |                 total += 1
65 |                 samples.append(sample)
66 | 
67 |                 if total % 50000 == 0:
68 |                     print_rank_0('  > processed {} so far ...'.format(total))
69 | 
70 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
71 |         return samples
72 | 


--------------------------------------------------------------------------------
/tasks/glue/qqp.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """QQP dataset."""
 4 | 
 5 | from megatron import print_rank_0
 6 | from tasks.data_utils import clean_text
 7 | from .data import GLUEAbstractDataset
 8 | 
 9 | 
10 | LABELS = [0, 1]
11 | 
12 | 
13 | class QQPDataset(GLUEAbstractDataset):
14 | 
15 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
16 |                  test_label=0):
17 |         self.test_label = test_label
18 |         super().__init__('QQP', name, datapaths,
19 |                          tokenizer, max_seq_length)
20 | 
21 |     def process_samples_from_single_path(self, filename):
22 |         """"Implement abstract method."""
23 |         print_rank_0(' > Processing {} ...'.format(filename))
24 | 
25 |         samples = []
26 |         total = 0
27 |         first = True
28 |         is_test = False
29 |         with open(filename, 'r') as f:
30 |             for line in f:
31 |                 row = line.strip().split('\t')
32 |                 if first:
33 |                     first = False
34 |                     if len(row) == 3:
35 |                         is_test = True
36 |                         print_rank_0('   reading {}, {}, and {} columns and '
37 |                                      'setting labels to {}'.format(
38 |                                          row[0].strip(), row[1].strip(),
39 |                                          row[2].strip(), self.test_label))
40 |                     else:
41 |                         assert len(row) == 6
42 |                         print_rank_0('    reading {}, {}, {}, and {} columns'
43 |                                      ' ...'.format(
44 |                                          row[0].strip(), row[3].strip(),
45 |                                          row[4].strip(), row[5].strip()))
46 |                     continue
47 | 
48 |                 if is_test:
49 |                     assert len(row) == 3, 'expected length 3: {}'.format(row)
50 |                     uid = int(row[0].strip())
51 |                     text_a = clean_text(row[1].strip())
52 |                     text_b = clean_text(row[2].strip())
53 |                     label = self.test_label
54 |                     assert len(text_a) > 0
55 |                     assert len(text_b) > 0
56 |                 else:
57 |                     if len(row) == 6:
58 |                         uid = int(row[0].strip())
59 |                         text_a = clean_text(row[3].strip())
60 |                         text_b = clean_text(row[4].strip())
61 |                         label = int(row[5].strip())
62 |                     else:
63 |                         print_rank_0('***WARNING*** index error, '
64 |                                      'skipping: {}'.format(row))
65 |                         continue
66 |                     if len(text_a) == 0:
67 |                         print_rank_0('***WARNING*** zero length a, '
68 |                                      'skipping: {}'.format(row))
69 |                         continue
70 |                     if len(text_b) == 0:
71 |                         print_rank_0('***WARNING*** zero length b, '
72 |                                      'skipping: {}'.format(row))
73 |                         continue
74 |                 assert label in LABELS
75 |                 assert uid >= 0
76 | 
77 |                 sample = {'uid': uid,
78 |                           'text_a': text_a,
79 |                           'text_b': text_b,
80 |                           'label': label}
81 |                 total += 1
82 |                 samples.append(sample)
83 | 
84 |                 if total % 50000 == 0:
85 |                     print_rank_0('  > processed {} so far ...'.format(total))
86 | 
87 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
88 |         return samples
89 | 


--------------------------------------------------------------------------------
/tasks/msdp/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation
 3 | 
 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework.
 5 | 
 6 | ## Multi-Stage Dialogue Prompting
 7 | 
 8 | ### Data Preparation
 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets.
11 | 
12 | ### Stage-1: Prompting for Knowledge Generation
13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation.
15 | 
16 | ### Stage-2: Prompting for Response Generation
17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file).
18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation.
19 | 3.  We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation.
20 | 


--------------------------------------------------------------------------------
/tasks/msdp/evaluate.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Model evaluation"""
 4 | 
 5 | from megatron import get_args
 6 | from megatron import print_rank_0
 7 | from tasks.msdp.metrics import F1Metric
 8 | from tqdm import tqdm
 9 | 
10 | 
11 | def evaluate_f1(guess_file, answer_file):
12 |     """Evaluating F1 Score"""
13 | 
14 |     guess_list = []
15 |     print_rank_0('reading %s' % guess_file)
16 |     with open(guess_file, "r") as f:
17 |         for i, line in enumerate(tqdm(f)):
18 |             line = line.strip()
19 |             if "<|endoftext|>" in line:
20 |                 line = line.replace("<|endoftext|>", "")
21 |             guess_list.append(line)
22 | 
23 |     answer_list = []
24 |     print_rank_0('reading %s' % answer_file)
25 |     with open(answer_file, "r") as f:
26 |         for i, line in enumerate(tqdm(f)):
27 |             line = line.strip()
28 |             if line == "no_passages_used":
29 |                 line = ""
30 |             answer_list.append(line)
31 | 
32 |     assert len(guess_list) == len(answer_list), \
33 |         "lengths of guess and answer are different!"
34 | 
35 |     precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list)
36 |     print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1))
37 | 
38 |     print_rank_0('done :-)')
39 | 
40 | 
41 | def main():
42 |     args = get_args()
43 |     
44 |     evaluate_f1(args.guess_file, args.answer_file)
45 | 
46 | 


--------------------------------------------------------------------------------
/tasks/msdp/main.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Run multi-stage dialogue prompting (MSDP)."""
 4 | 
 5 | import os
 6 | import sys
 7 | sys.path.append(os.path.abspath(os.path.join(
 8 |     os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir)))
 9 | from megatron import get_args
10 | from megatron.initialize import initialize_megatron
11 | 
12 | 
13 | def get_tasks_args(parser):
14 |     """Provide extra arguments required for tasks."""
15 |     group = parser.add_argument_group(title='tasks')
16 | 
17 |     # parameters for the knowledgeable dialogue generation
18 |     group.add_argument('--task', type=str, required=True,
19 |                        help='Task name.')
20 |     group.add_argument("--sample-input-file", type=str, default=None,
21 |                        help='Get input from file instead of interactive mode, '
22 |                        'each line is an input.')
23 |     group.add_argument("--sample-output-file", type=str, default=None,
24 |                        help='Output file got from --sample-input-file')
25 |     group.add_argument('--prompt-file', type=str, default=None,
26 |                        help='prompting file')
27 |     group.add_argument('--prompt-type', type=str, default=None, 
28 |                        choices=['knowledge', 'response'],
29 |                        help='prompt type (knowledge or response)')
30 |     group.add_argument('--num-prompt-examples', type=int, default=10,
31 |                        help='number of prompt examples')
32 |     group.add_argument('--guess-file', type=str, default=None,
33 |                        help='datapath for generated sentences')
34 |     group.add_argument('--answer-file', type=str, default=None,
35 |                        help='datapath for golden sentences')
36 |     group.add_argument('--out-seq-length', type=int, default=100,
37 |                        help='output sequence length')
38 |     group.add_argument('--api-prompt', default=False, action="store_true",
39 |                        help='setup model api for prompting')
40 |     group.add_argument('--megatron-api-url', type=str, default=None,
41 |                        help='url of the megatron api')
42 | 
43 |     return parser
44 | 
45 | 
46 | if __name__ == '__main__':
47 | 
48 |     initialize_megatron(extra_args_provider=get_tasks_args)
49 | 
50 |     args = get_args()
51 | 
52 |     if args.num_layers_per_virtual_pipeline_stage is not None:
53 |         print("Interleaved pipeline schedule is not yet supported for downstream tasks.")
54 |         exit()
55 | 
56 |     if args.task == 'MSDP-PROMPT':
57 |         from tasks.msdp.prompt import main
58 | 
59 |     elif args.task == 'MSDP-EVAL-F1':
60 |         from tasks.msdp.evaluate import main
61 | 
62 |     else:
63 |         raise NotImplementedError('Task {} is not implemented.'.format(
64 |             args.task))
65 | 
66 |     main()
67 | 


--------------------------------------------------------------------------------
/tasks/msdp/metrics.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # The following code is adapted from
 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 
 4 | # which is licensed under the MIT license. More details on the license can be 
 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE.
 6 | 
 7 | """Provides standard metric evaluations for dialog."""
 8 | 
 9 | from collections import Counter
10 | from typing import List
11 | import numpy as np
12 | import re
13 | 
14 | re_art = re.compile(r'\b(a|an|the)\b')
15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
16 | 
17 | 
18 | def normalize_answer(s):
19 |     """
20 |     Lower text and remove punctuation, articles and extra whitespace.
21 |     """
22 |     s = s.lower()
23 |     s = re_punc.sub(' ', s)
24 |     s = re_art.sub(' ', s)
25 |     s = ' '.join(s.split())
26 |     return s
27 | 
28 | 
29 | class F1Metric:
30 |     """
31 |     Helper class which computes token-level F1.
32 |     """
33 | 
34 |     @staticmethod
35 |     def _prec_recall_f1_score(pred_items, gold_items):
36 |         """
37 |         Compute precision, recall and f1 given a set of gold and prediction items.
38 |         :param pred_items: iterable of predicted values
39 |         :param gold_items: iterable of gold values
40 |         :return: tuple (p, r, f1) for precision, recall, f1
41 |         """
42 |         common = Counter(gold_items) & Counter(pred_items)
43 |         num_same = sum(common.values())
44 |         if num_same == 0:
45 |             return 0, 0, 0
46 |         precision = 1.0 * num_same / len(pred_items)
47 |         recall = 1.0 * num_same / len(gold_items)
48 |         f1 = (2 * precision * recall) / (precision + recall)
49 |         return precision, recall, f1
50 | 
51 |     @staticmethod
52 |     def compute_each_pair(guess: str, answer: str):
53 |         if answer == "":
54 |             return None, None, None
55 |         if guess == "":
56 |             return 0, 0, 0
57 |         g_tokens = normalize_answer(guess).split()
58 |         a_tokens = normalize_answer(answer).split()
59 | 
60 |         precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens)
61 |         return precision, recall, f1
62 |         
63 |     @staticmethod
64 |     def compute_all_pairs(guesses: List[str], answers: List[str]):
65 |         # additional augment:
66 |         assert len(guesses) == len(answers)
67 |         
68 |         precision_list, recall_list, f1_list = [], [], []
69 |         for guess, answer in zip(guesses, answers):
70 |             precision, recall, f1 = F1Metric.compute_each_pair(guess, answer)
71 |             if precision is None or recall is None or f1 is None:
72 |                 continue
73 |             precision_list.append(precision)
74 |             recall_list.append(recall)
75 |             f1_list.append(f1)
76 |         
77 |         return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
78 | 


--------------------------------------------------------------------------------
/tasks/orqa/README.md:
--------------------------------------------------------------------------------
 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
 2 | 
 3 | Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
 4 | 
 5 | ## Retriever Training
 6 | 
 7 | #### Unsupervised pretraining
 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body.
 9 | 
10 | <pre>
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | </pre>
20 | 
21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training.
22 | 
23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf).
24 | 
25 | #### Supervised finetuning
26 | 
27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906).
28 | 
29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model.
30 | 
31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408).
32 | 
33 | ## Reader Training
34 | 
35 | The reader component will be available soon.
36 | 
37 | 


--------------------------------------------------------------------------------
/tasks/orqa/evaluate_orqa.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Main tasks functionality."""
 4 | 
 5 | from megatron import get_args, print_rank_0
 6 | from megatron.indexer import IndexBuilder
 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator
 8 | 
 9 | def main():
10 |     """
11 |     Main program
12 |     """
13 | 
14 |     args = get_args()
15 | 
16 |     """
17 |     Create a BlockData data structure by running an IndexBuilder over an
18 |     ICT Dataset and then evaluate on NQ task
19 |     """
20 | 
21 |     print_rank_0("Starting index builder!")
22 | 
23 |     index_builder = IndexBuilder()
24 |     index_builder.build_and_save_index()
25 |     print_rank_0("Build and save indices: done!")
26 | 
27 | 
28 |     print_rank_0("Starting evaluations!")
29 | 
30 |     # Set up the model and evaluator
31 |     evaluator = ORQAEvaluator()
32 | 
33 |     # Run evaluation
34 |     if args.qa_data_dev is not None:
35 |         evaluator.evaluate(args.qa_data_dev, "DEV")
36 | 
37 |     if args.qa_data_test is not None:
38 |         evaluator.evaluate(args.qa_data_test, "TEST")
39 | 
40 | 


--------------------------------------------------------------------------------
/tasks/race/finetune.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Race."""
 4 | 
 5 | from megatron import get_args
 6 | from megatron import print_rank_0
 7 | from megatron import get_tokenizer
 8 | from megatron.model.multiple_choice import MultipleChoice
 9 | from tasks.eval_utils import accuracy_func_provider
10 | from tasks.finetune_utils import finetune
11 | from tasks.race.data import RaceDataset
12 | 
13 | 
14 | def train_valid_datasets_provider():
15 |     """Provide train and validation datasets."""
16 |     args = get_args()
17 |     tokenizer = get_tokenizer()
18 | 
19 |     train_dataset = RaceDataset('training', args.train_data,
20 |                                 tokenizer, args.seq_length)
21 |     valid_dataset = RaceDataset('validation', args.valid_data,
22 |                                 tokenizer, args.seq_length)
23 | 
24 |     return train_dataset, valid_dataset
25 | 
26 | 
27 | def model_provider(pre_process=True, post_process=True):
28 |     """Build the model."""
29 | 
30 |     print_rank_0('building multichoice model for RACE ...')
31 |     model = MultipleChoice(num_tokentypes=2,
32 |                            pre_process=pre_process,
33 |                            post_process=post_process)
34 | 
35 |     return model
36 | 
37 | 
38 | def metrics_func_provider():
39 |     """Privde metrics callback function."""
40 |     args = get_args()
41 |     tokenizer = get_tokenizer()
42 | 
43 |     def single_dataset_provider(datapath):
44 |         name = datapath.split('RACE')[-1].strip('/').replace('/', '-')
45 |         return RaceDataset(name, [datapath], tokenizer, args.seq_length)
46 | 
47 |     return accuracy_func_provider(single_dataset_provider)
48 | 
49 | 
50 | def main():
51 | 
52 |     finetune(train_valid_datasets_provider, model_provider,
53 |              end_of_epoch_callback_provider=metrics_func_provider)
54 | 


--------------------------------------------------------------------------------
/tasks/vision/classification/classification.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Vision-classification finetuning/evaluation."""
 4 | 
 5 | import torch.nn.functional as F
 6 | from functools import partial
 7 | from megatron import get_args, get_timers
 8 | from megatron import print_rank_0
 9 | from megatron.model.vision.classification import VitClassificationModel
10 | from megatron.data.vit_dataset import build_train_valid_datasets
11 | from tasks.vision.classification.eval_utils import accuracy_func_provider
12 | from tasks.vision.finetune_utils import finetune
13 | from megatron.utils import average_losses_across_data_parallel_group
14 | 
15 | 
16 | def classification():
17 |     def train_valid_datasets_provider():
18 |         """Build train and validation dataset."""
19 |         args = get_args()
20 | 
21 |         train_ds, valid_ds = build_train_valid_datasets(
22 |             data_path=args.data_path,
23 |             image_size=(args.img_h, args.img_w),
24 |         )
25 |         return train_ds, valid_ds
26 | 
27 |     def model_provider(pre_process=True, post_process=True):
28 |         """Build the model."""
29 |         args = get_args()
30 | 
31 |         print_rank_0("building classification model for ImageNet ...")
32 | 
33 |         return VitClassificationModel(num_classes=args.num_classes, finetune=True,
34 |                                       pre_process=pre_process, post_process=post_process)
35 | 
36 |     def process_batch(batch):
37 |         """Process batch and produce inputs for the model."""
38 |         images = batch[0].cuda().contiguous()
39 |         labels = batch[1].cuda().contiguous()
40 |         return images, labels
41 | 
42 |     def cross_entropy_loss_func(labels, output_tensor):
43 |         logits = output_tensor
44 | 
45 |         # Cross-entropy loss.
46 |         loss = F.cross_entropy(logits.contiguous().float(), labels)
47 | 
48 |         # Reduce loss for logging.
49 |         averaged_loss = average_losses_across_data_parallel_group([loss])
50 | 
51 |         return loss, {'lm loss': averaged_loss[0]}
52 | 
53 |     def _cross_entropy_forward_step(batch, model):
54 |         """Simple forward step with cross-entropy loss."""
55 |         timers = get_timers()
56 | 
57 |         # Get the batch.
58 |         timers("batch generator", log_level=2).start()
59 |         try:
60 |             batch_ = next(batch)
61 |         except BaseException:
62 |             batch_ = batch
63 |         images, labels = process_batch(batch_)
64 |         timers("batch generator").stop()
65 | 
66 |         # Forward model.
67 |         output_tensor = model(images)
68 |       
69 |         return output_tensor, partial(cross_entropy_loss_func, labels)
70 | 
71 |     """Finetune/evaluate."""
72 |     finetune(
73 |         train_valid_datasets_provider,
74 |         model_provider,
75 |         forward_step=_cross_entropy_forward_step,
76 |         end_of_epoch_callback_provider=accuracy_func_provider,
77 |     )
78 | 
79 | def main():
80 |     classification()
81 | 
82 | 


--------------------------------------------------------------------------------
/tasks/vision/main.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Main tasks functionality."""
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | sys.path.append(
 9 |     os.path.abspath(
10 |         os.path.join(
11 |             os.path.join(os.path.dirname(__file__), os.path.pardir),
12 |             os.path.pardir,
13 |         )
14 |     )
15 | )
16 | from megatron import get_args
17 | from megatron.initialize import initialize_megatron
18 | 
19 | def get_tasks_args(parser):
20 |     """Provide extra arguments required for tasks."""
21 |     group = parser.add_argument_group(title="tasks")
22 | 
23 |     group.add_argument('--task', type=str, default='segment',
24 |                        choices=['classify', 'segment_setr', 'segment_segformer'],
25 |                        help='task name.')
26 |     group.add_argument("--epochs", type=int, default=None,
27 |                        help="Number of finetunning epochs. Zero results in "
28 |                        "evaluation only.")
29 |     group.add_argument('--pretrained-checkpoint-type', type=str, default='default',
30 |                        choices=['default', 'external', 'constrastive'],
31 |                        help='Type of pretrained checkpoint')
32 |     group.add_argument("--pretrained-checkpoint", type=str, default=None,
33 |                        help="Pretrained checkpoint used for finetunning.")
34 |     group.add_argument('--seg-stride', type=int, default=None,
35 |                        help='sliding window stride during evaluation')
36 |     return parser
37 | 
38 | 
39 | if __name__ == "__main__":
40 | 
41 |     initialize_megatron(extra_args_provider=get_tasks_args)
42 |     args = get_args()
43 | 
44 |     if args.task == 'classify':
45 |         from tasks.vision.classification.classification import main
46 |         main()
47 |     elif args.task == 'segment_setr':
48 |         from tasks.vision.segmentation.finetune_setr import main
49 |         main()
50 |     elif args.task == 'segment_segformer':
51 |         from tasks.vision.segmentation.finetune_segformer import main
52 |         main()
53 | 
54 | 


--------------------------------------------------------------------------------
/tasks/vision/segmentation/seg_models.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | import math
 3 | import einops
 4 | import torch
 5 | import apex
 6 | import torch.nn.functional as F
 7 | from megatron import get_args
 8 | from megatron.model.module import MegatronModule
 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead
10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5
11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead
12 | 
13 | 
14 | class SetrSegmentationModel(MegatronModule):
15 | 
16 |     def __init__(self,
17 |                  num_classes,
18 |                  pre_process=True,
19 |                  post_process=True):
20 |         super(SetrSegmentationModel, self).__init__()
21 |         args = get_args()
22 |         assert post_process & pre_process
23 |         self.hidden_size = args.hidden_size
24 |         self.num_classes = num_classes
25 |         self.backbone = VitBackbone(
26 |             pre_process=pre_process,
27 |             post_process=post_process,
28 |             class_token=False,
29 |             post_layer_norm=False,
30 |             drop_path_rate=0.1
31 |         )
32 | 
33 |         self.head = SetrSegmentationHead(
34 |             self.hidden_size,
35 |             self.num_classes
36 |         )
37 | 
38 |     def set_input_tensor(self, input_tensor):
39 |         """See megatron.model.transformer.set_input_tensor()"""
40 |         pass
41 | 
42 |     def forward(self, input):
43 |         # [b hw c]
44 |         hidden_states = self.backbone(input)
45 |         result_final = self.head(hidden_states)
46 |         return result_final
47 | 
48 | 
49 | class SegformerSegmentationModel(MegatronModule):
50 | 
51 |     def __init__(self,
52 |                  num_classes,
53 |                  pre_process=True,
54 |                  post_process=True):
55 |         super(SegformerSegmentationModel, self).__init__()
56 |         args = get_args()
57 |         self.hidden_size = args.hidden_size
58 |         self.num_classes = num_classes
59 |         self.pre_process = pre_process
60 |         self.post_process = post_process
61 | 
62 |         self.backbone = mit_b5()
63 |         self.head = SegformerSegmentationHead(
64 |             feature_strides=[4, 8, 16, 32],
65 |             in_channels=[64, 128, 320, 512],
66 |             embedding_dim=768,
67 |             dropout_ratio=0.1
68 |         )
69 | 
70 |     def set_input_tensor(self, input_tensor):
71 |         """See megatron.model.transformer.set_input_tensor()"""
72 |         pass
73 | 
74 |     def forward(self, input):
75 |         # [b hw c]
76 |         hidden_states = self.backbone(input)
77 |         hidden_states = self.head(hidden_states)
78 |         return hidden_states
79 | 
80 | 


--------------------------------------------------------------------------------
/tasks/vision/segmentation/utils.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch
 3 | import numpy as np
 4 | from megatron import get_args
 5 | 
 6 | def slidingcrops(img, mask):
 7 |     # img: [b c h w]
 8 |     # mask: [b h w]
 9 |     args = get_args()
10 |     assert args.img_h == args.img_w
11 |     crop_size = args.img_h
12 |     stride = args.seg_stride
13 |     ignore_index = args.ignore_index
14 |     n, c, h, w = img.shape
15 |     assert h >= crop_size
16 |     assert w >= crop_size
17 |     long_size = max(h, w)
18 | 
19 |     img_slices, mask_slices, slices_info = [], [], []
20 |     if long_size > crop_size:
21 |         assert stride <= crop_size
22 |         h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1
23 |         w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1
24 |         for yy in range(h_step_num):
25 |             for xx in range(w_step_num):
26 |                 sy, sx = yy * stride, xx * stride
27 |                 ey, ex = sy + crop_size, sx + crop_size
28 |                 img_sub = img[:, :, sy: ey, sx: ex]
29 |                 mask_sub = mask[:, sy: ey, sx: ex]
30 | 
31 |                 # padding
32 |                 sub_h, sub_w = img_sub.shape[2:]
33 |                 pad_h = max(crop_size - sub_h, 0)
34 |                 pad_w = max(crop_size - sub_w, 0)
35 |                 img_sub = torch.nn.functional.pad(img_sub, pad=(0, pad_w, 0, pad_h), value=ignore_index)
36 |                 mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h))
37 | 
38 |                 img_slices.append(img_sub)
39 |                 mask_slices.append(mask_sub)
40 |                 slices_info.append([sy, ey, sx, ex, sub_h, sub_w])
41 | 
42 |         return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w)
43 |     else:
44 |         return img, mask, [[0, h, 0, w, h, w]], (h, w)
45 | 
46 | 
47 | def slidingjoins(preds, probs, labels, slices_info, img_size):
48 |     args = get_args()
49 |     num_slices = len(slices_info)
50 | 
51 |     if num_slices == 1:
52 |         return preds, labels
53 | 
54 |     h, w = img_size
55 |     split_size = args.micro_batch_size
56 | 
57 |     preds_split = torch.split(preds, split_size)
58 |     probs_split = torch.split(probs, split_size)
59 |     labels_split = torch.split(labels, split_size)
60 | 
61 |     assert(len(preds_split) == num_slices)
62 | 
63 |     total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda')
64 |     total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
65 |     total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
66 | 
67 |     for i in range(num_slices):
68 |         sy, ey, sx, ex, sub_h, sub_w = slices_info[i]
69 |         assert sy + sub_h <= h
70 |         assert sx + sub_w <= w
71 |         curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w]
72 |         curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w]
73 | 
74 |         local_max_probs = probs_split[i][:, :sub_h, : sub_w]
75 |         local_preds = preds_split[i][:, :sub_h, :sub_w]
76 | 
77 |         result_max_probs = torch.maximum(curr_max_probs, local_max_probs)
78 |         result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds)
79 | 
80 |         total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs
81 |         total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds
82 |         total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w]
83 | 
84 |     return total_preds, total_labels
85 | 
86 | 


--------------------------------------------------------------------------------
/tasks/zeroshot_gpt/detokenizer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Detokenization."""
 4 | 
 5 | import re
 6 | 
 7 | 
 8 | def ptb_detokenizer(string):
 9 |     string = string.replace(" '", "'")
10 |     string = string.replace(" \n", "\n")
11 |     string = string.replace("\n ", "\n")
12 |     string = string.replace(" n't", "n't")
13 |     string = string.replace(" N ", "1 ")
14 |     string = string.replace("$ 1", "$1")
15 |     string = string.replace("# 1", "#1")
16 |     return string
17 | 
18 | 
19 | def wikitext_detokenizer(string):
20 |     # contractions
21 |     string = string.replace("s '", "s'")
22 |     string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
23 |     # number separators
24 |     string = string.replace(" @-@ ", "-")
25 |     string = string.replace(" @,@ ", ",")
26 |     string = string.replace(" @.@ ", ".")
27 |     # punctuation
28 |     string = string.replace(" : ", ": ")
29 |     string = string.replace(" ; ", "; ")
30 |     string = string.replace(" . ", ". ")
31 |     string = string.replace(" ! ", "! ")
32 |     string = string.replace(" ? ", "? ")
33 |     string = string.replace(" , ", ", ")
34 |     # double brackets
35 |     string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
36 |     string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
37 |     string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
38 |     string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
39 |     string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
40 |     # miscellaneous
41 |     string = string.replace("= = = =", "====")
42 |     string = string.replace("= = =", "===")
43 |     string = string.replace("= =", "==")
44 |     string = string.replace(" " + chr(176) + " ", chr(176))
45 |     string = string.replace(" \n", "\n")
46 |     string = string.replace("\n ", "\n")
47 |     string = string.replace(" N ", " 1 ")
48 |     string = string.replace(" 's", "'s")
49 | 
50 |     return string
51 | 
52 | 
53 | def lambada_detokenizer(string):
54 |     return string
55 | 
56 | 
57 | _DETOKENIZERS = {
58 |     'ptb': ptb_detokenizer,
59 |     'wiki': wikitext_detokenizer,
60 |     'lambada': lambada_detokenizer,
61 | }
62 | 
63 | 
64 | def get_detokenizer(path):
65 |     for key in _DETOKENIZERS.keys():
66 |         if key in path:
67 |             return _DETOKENIZERS[key]
68 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanford-futuredata/Megatron-LM/85f95aef3b648075fe6f291c86714fdcbd9cd1f5/tests/__init__.py


--------------------------------------------------------------------------------
/tests/tensor_parallel/test_cross_entropy.py:
--------------------------------------------------------------------------------
 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
 2 | import torch
 3 | from tests.test_utilities import Utils
 4 | import numpy as np
 5 | 
 6 | def test_vocab_parallel_cross_entropy():
 7 |     Utils.initialize_model_parallel(4,2)
 8 |     vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda()
 9 |     target = torch.arange(0,32,2).cuda()
10 |     output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
11 |     expected_output = torch.tensor([10.2309,  8.2309,  6.2309,  4.2309, 10.2309,  8.2309,  6.2309,  4.2309,
12 |         10.2309,  8.2309,  6.2309,  4.2309, 10.2309,  8.2309,  6.2309,  4.2309]).cuda()
13 |     assert(torch.equal(torch.round(expected_output), torch.round(output)))
14 |     Utils.destroy_model_parallel()


--------------------------------------------------------------------------------
/tests/tensor_parallel/test_data.py:
--------------------------------------------------------------------------------
 1 | from megatron.core.tensor_parallel.data import broadcast_data
 2 | import torch
 3 | from tests.test_utilities import Utils
 4 | 
 5 | def test_broadcast_data():
 6 |     Utils.initialize_model_parallel(2,4)
 7 |     input_data = {
 8 |         0 : torch.ones((8,8)).cuda() * 0.0,
 9 |         1 : torch.ones((8,8)).cuda() * 1.0,
10 |         2 : torch.ones((8,8)).cuda() * 2.0,
11 |         3 : torch.ones((8,8)).cuda() * 3.0,
12 |         4 : torch.ones((8,8)).cuda() * 4.0,
13 |         5 : torch.ones((8,8)).cuda() * 5.0,
14 |         6 : torch.ones((8,8)).cuda() * 6.0,
15 |         7 : torch.ones((8,8)).cuda() * 7.0
16 |         }
17 |     dtype = torch.float32
18 |     actual_output = broadcast_data([0,1],input_data, dtype)
19 |     assert(torch.equal(actual_output[0], input_data[0]))
20 |     assert(torch.equal(actual_output[1], input_data[1]))
21 |     Utils.destroy_model_parallel()


--------------------------------------------------------------------------------
/tests/tensor_parallel/test_random.py:
--------------------------------------------------------------------------------
 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker
 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER
 4 | from megatron.core.tensor_parallel.random import checkpoint
 5 | from tests.test_utilities import Utils
 6 | import pytest
 7 | import torch
 8 | 
 9 | def test_cuda_rng_states_tracker():
10 |     rng_tracker = CudaRNGStatesTracker()
11 |     rng_tracker.set_states({"state1":1234})
12 |     assert(rng_tracker.get_states()["state1"] == 1234)
13 |     rng_tracker.reset()
14 |     assert(rng_tracker.get_states() == {})
15 |     seed = 1111
16 |     rng_tracker.add("state2",seed)
17 |     with pytest.raises(Exception):
18 |         assert(rng_tracker.add("state3",seed))
19 |     with pytest.raises(Exception):
20 |         assert(rng_tracker.add("state2",111))
21 |     assert(rng_tracker.get_states()['state2'] is not None)
22 |     with pytest.raises(Exception):
23 |         assert()
24 |     
25 |     rng_tracker.fork("state2")
26 |     torch.cuda.manual_seed(seed)
27 |     rng_state = torch.cuda.get_rng_state()
28 |     assert torch.equal(rng_tracker.get_states()['state2'], rng_state)
29 | 
30 | def test_model_parallel_cuda_manual_seed():
31 |     Utils.initialize_model_parallel(4,2)
32 |     model_parallel_cuda_manual_seed(0)
33 |     assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None)
34 |     Utils.destroy_model_parallel()
35 | 
36 | def test_checkpoint():
37 |     def test_forward(*input):
38 |         return input[0]+input[1]
39 |     assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2)))
40 |     Utils.initialize_model_parallel()
41 |     input1 = torch.ones((4,4))
42 |     checkpoint(test_forward, True, input1, torch.ones((4,4))*2)
43 |     assert(torch.equal(torch.ones(input1.numel()).cuda(), input1))
44 |     Utils.destroy_model_parallel()


--------------------------------------------------------------------------------
/tests/tensor_parallel/test_tensor_parallel_utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import megatron.core.tensor_parallel.utils as util
 3 | import megatron.core.parallel_state as ps
 4 | from tests.test_utilities import Utils
 5 | 
 6 | rank = Utils.rank
 7 | 
 8 | def test_split_tensor_along_last_dim():
 9 |     input_tensor = torch.rand((3,4))
10 |     torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0])
11 |     torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1])
12 | 
13 | def test_split_tensor_into_1d_equal_chunks():
14 |     Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
15 |     input_tensor = torch.rand((3,4))
16 |     output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor)
17 |     if rank % 2 == 0 :
18 |         start = 0
19 |         end = int(input_tensor.numel()/2)
20 |     else :
21 |         start = int(input_tensor.numel()/2)
22 |         end = input_tensor.numel()
23 |         
24 |     assert torch.equal(output_tensor, input_tensor.flatten()[start:end])
25 |     Utils.destroy_model_parallel()
26 | 
27 | def test_gather_split_1d_tensor():
28 |     Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
29 |     input_tensor = torch.ones((2,4)).cuda() * rank
30 |     actual_output_tensor = util.gather_split_1d_tensor(input_tensor)
31 |     if rank %2 == 0:
32 |         expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1))
33 |     else : 
34 |         expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten()))
35 |     assert(torch.equal(actual_output_tensor, expected_output_tensor))
36 |     Utils.destroy_model_parallel()
37 | 
38 | def test_vocab():
39 |     global_vocab_size = 1600
40 |     per_partition_vocab_size = 1600 / Utils.world_size
41 |     assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size)))
42 |     assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size)))
43 |     


--------------------------------------------------------------------------------
/tests/test_basic.py:
--------------------------------------------------------------------------------
1 | def test_import():
2 |     import megatron
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/test_utilities.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import megatron.core.parallel_state as ps
 4 | 
 5 | class Utils:
 6 | 
 7 |     world_size = torch.cuda.device_count()
 8 |     rank = int(os.environ['LOCAL_RANK'])
 9 | 
10 |     @staticmethod
11 |     def initialize_distributed():
12 |         print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}')
13 |         torch.cuda.set_device(Utils.rank % torch.cuda.device_count())
14 |         init_method = 'tcp://'
15 |         master_ip = os.getenv('MASTER_ADDR', 'localhost')
16 |         master_port = os.getenv('MASTER_PORT', '6000')
17 |         init_method += master_ip + ':' + master_port
18 |         torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method)
19 |         
20 |     @staticmethod
21 |     def destroy_model_parallel():
22 |         ps.destroy_model_parallel()
23 |         torch.distributed.barrier()
24 | 
25 |     @staticmethod
26 |     def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None):
27 |         ps.destroy_model_parallel()
28 |         if not torch.distributed.is_initialized():
29 |             Utils.initialize_distributed()
30 |         ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank)


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | import megatron.core.utils as util
 4 | import numpy as np
 5 | 
 6 | def test_divide_properly():
 7 |     assert util.divide(4,2) == 2
 8 | 
 9 | def test_divide_improperly():
10 |     with pytest.raises(AssertionError):
11 |         util.divide(4,5)
12 | 
13 | def test_global_memory_buffer():
14 |     global_memory_buffer = util.GlobalMemoryBuffer()
15 |     obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor")
16 |     expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device())
17 |     assert torch.equal(obtained_tensor, expected_tensor)
18 | 
19 | def test_make_viewless_tensor():
20 |     inp = torch.rand((3,4))
21 |     assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True)))
22 |     assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False)))
23 | 
24 | def test_safely_set_viewless_tensor_data():
25 |     tensor = torch.zeros((3,4))
26 |     new_data_tensor = torch.tensor(np.random.rand(3,4))
27 |     util.safely_set_viewless_tensor_data(tensor, new_data_tensor)
28 |     assert(torch.equal(tensor, new_data_tensor))
29 | 
30 | def test_assert_viewless_tensor():
31 |     tensor = torch.rand((3,4))
32 |     assert(torch.equal(util.assert_viewless_tensor(tensor), tensor))
33 |     input_tensor_list=[tensor,tensor,tensor]
34 |     output_tensor_list = util.assert_viewless_tensor(input_tensor_list)
35 |     for inp,out in zip(input_tensor_list, output_tensor_list):
36 |         assert(torch.equal(inp,out))
37 | 


--------------------------------------------------------------------------------
/tools/linter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import os.path as osp
 3 | import pathlib
 4 | import subprocess
 5 | 
 6 | 
 7 | def recursively_lint_files():
 8 |     """Recursively lint all python files in chosen subdirectories of megatron-lm"""
 9 | 
10 |     try:
11 |         import autopep8
12 |     except ModuleNotFoundError:
13 |         print("Please first install autopep8 via `pip install autopep8`")
14 |         return
15 | 
16 |     # get all python file paths from top level directory
17 |     file_dir = str(pathlib.Path(__file__).parent.absolute())
18 |     working_dir = osp.join(file_dir, os.pardir)
19 |     all_py_paths = set(os.path.join(working_dir, fname)
20 |                        for fname in os.listdir(working_dir) if ".py" in fname)
21 | 
22 |     # get all python file paths from chosen subdirectories
23 |     check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
24 |     for sub_dir in check_dirs:
25 |         for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
26 |             all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname))
27 | 
28 |     print("Linting the following: ")
29 |     for py_path in all_py_paths:
30 |         print(py_path)
31 |         command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path)
32 |         subprocess.check_call(command)
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     recursively_lint_files()
37 | 


--------------------------------------------------------------------------------
/tools/merge_datasets.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import json
 4 | import argparse
 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
 6 |                                              os.path.pardir)))
 7 | 
 8 | from megatron.data import indexed_dataset
 9 | 
10 | 
11 | def main(args):
12 | 
13 |     prefixes = set()
14 |     for basename in os.listdir(args.input):
15 |         prefix, ext = os.path.splitext(basename)
16 | 
17 |         if prefix in prefixes:
18 |             continue
19 | 
20 |         if not os.path.isfile(os.path.join(args.input, basename)):
21 |             continue
22 | 
23 |         ext_pair = '.bin' if ext == '.idx' else '.idx'
24 |         assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \
25 |                f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}'
26 | 
27 |         prefixes.add(prefix)
28 | 
29 |     builder = None
30 |     for prefix in sorted(prefixes):
31 |         if builder is None:
32 |             dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer')
33 | 
34 |             if isinstance(dataset, indexed_dataset.MMapIndexedDataset):
35 |                 builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype)
36 |             else:
37 |                 builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin')
38 | 
39 |             del dataset
40 | 
41 |         builder.merge_file_(os.path.join(args.input, prefix))
42 | 
43 |     builder.finalize(args.output_prefix + '.idx')
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     parser = argparse.ArgumentParser()
48 | 
49 |     group = parser.add_argument_group(title='input data')
50 |     group.add_argument('--input', type=str, required=True,
51 |                        help='Path to directory containing all document files to merge')
52 | 
53 |     group = parser.add_argument_group(title='output data')
54 |     group.add_argument('--output-prefix', type=str, required=True,
55 |                        help='Path to binary output file without suffix')
56 | 
57 |     args = parser.parse_args()
58 | 
59 |     assert os.path.isdir(args.input), \
60 |            f'ERROR: {args.input} is not a directory or does not exist'
61 | 
62 |     assert os.path.isdir(os.path.dirname(args.output_prefix)), \
63 |            f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist'
64 | 
65 |     main(args)
66 | 
67 | 


--------------------------------------------------------------------------------
/tools/openwebtext/README.md:
--------------------------------------------------------------------------------
 1 | The following steps show how to prepare training dataset to train the mode.
 2 | 
 3 | # Libraries to install
 4 | 
 5 | ```
 6 |     pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 
 7 |     git clone https://github.com/mattilyra/LSH
 8 |     cd LSH
 9 |     python setup.py install
10 | ``` 
11 | 
12 | # Download the dataset
13 | 
14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ)
15 | 2. Remove blacklisted URLs.
16 | ```
17 | python blacklist_urls.py <path to the dowloaded deduplicated URLs> <filename for clean urls. e.g. clean_urls.txt>
18 | ```
19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 
20 | 
21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique.
22 | 
23 | # Prepare the data for GPT training:
24 | 
25 | 1. Perform ftfy, english detection and remove documents with less than 128 tokens. This step can be sharded and run on shards.
26 | ```
27 | python cleanup_dataset.py <input data file> <output cleaned data filename>
28 | ```
29 | Additional cleanup (e.g. remove documents less than 512 characters or dataset specific cleaning like stories, realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`.
30 | 2. Using LSH, find possible duplicates and store then in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details are can be found by `python find_duplicate.py --help`.
31 | ```
32 | python find_duplicates.py --inputs <pairlist list of input cleaned data files and keys, e.g. cc.json cc_id news.json news_id> --output <output possible duplicate urls filename>
33 | ```
34 | 3. Based on similarity measure defind inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, only one url we should keep and remove the rest.
35 | ```
36 | python group_duplicate_urls.py <possible duplicate urls file> <output file containing similar urls>
37 | ```
38 | 4. Remove similar documents that were detected in the last step.
39 | ```
40 | python remove_group_duplicates.py <file containing simialr documents> <cleaned data file> <outputfile containing deduplicate data>
41 | ```
42 | 
43 | 5. Shuffle the dataset.
44 | ```
45 | shuf <cleaned deduped data file> -o train_data.json
46 | ```
47 | 
48 | # Deduplicating ngrams
49 | 
50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command.
51 | 
52 | ```
53 | python filter_ngrams.py --tasks <name of the task, e.g. lambada, squad> --dedup-dataset <training dataset to deduplicate> <json key> --output <output training dataset>
54 | ```
55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from the both side of the 13-gram. We also remove any splitted document with less than 200 characters or if a document got splitted more than 10 times. These parameters can be changed using corresponding arguments.
56 | 
57 | Only for the lambada task, we need to provide the path, `--lambada-path <path of the lambada test data>`.
58 | 
59 | Several other features (e.g. save and load dictionary) have been added, look at `python filter_ngrams.py --help` for details.
60 | 


--------------------------------------------------------------------------------
/tools/openwebtext/add_id.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import argparse
 4 | import json
 5 | import os
 6 | import time
 7 | 
 8 | """
 9 | This code adds id to each json object in a json file. User can add prefix
10 | to the ids.
11 | """
12 | 
13 | if __name__ == '__main__':
14 | 
15 |     print('parsing the arguments ...')
16 | 
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('--input-file', type=str, default=None, help='Input'\
19 |         ' json file where id needs to be added')
20 |     parser.add_argument('--output-file', type=str, default=None, help=\
21 |         'Output file name with id')
22 |     parser.add_argument('--id-prefix', type=str, default=None, help=\
23 |         'Id prefix')
24 |     parser.add_argument('--log-interval', type=int, default=100,
25 |                        help='Log interval')
26 |     args = parser.parse_args()
27 | 
28 |     print('Adding ids to dataset ...')
29 | 
30 |     f_input = open(args.input_file, 'r', encoding='utf-8')
31 |     f_output = open(args.output_file, 'wb')
32 | 
33 |     unique_ids = 1
34 |     start_time = time.time()
35 |     for row in f_input:
36 |         each_row = json.loads(row)
37 |         adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids))
38 |         each_row['adlr_id'] = adlr_id_string
39 |         myjson = json.dumps(each_row, ensure_ascii=False)
40 | 
41 |         f_output.write(myjson.encode('utf-8'))
42 |         f_output.write('\n'.encode('utf-8'))
43 | 
44 |         if unique_ids % args.log_interval == 0:
45 |             print('    processed {:9d} documents in {:.2f} seconds ...'.format( \
46 |                     unique_ids, time.time() - start_time), flush=True)
47 | 
48 |         unique_ids += 1
49 | 
50 |     # Close the file.
51 |     f_input.close()
52 |     f_output.close()
53 |     
54 |     print('done :-)', flush=True)
55 | 


--------------------------------------------------------------------------------
/tools/openwebtext/group_duplicate_url.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | import json
 4 | import time
 5 | import sys
 6 | 
 7 | 
 8 | if __name__ == '__main__':
 9 | 
10 | 
11 |     print('grouping duplicate urls ...')
12 | 
13 |     input = sys.argv[1]
14 |     output = sys.argv[2]
15 |     if len(sys.argv) > 3:
16 |         jaccard_similarity_threshold = float(sys.argv[3])
17 |     else:
18 |         jaccard_similarity_threshold = 0.7
19 | 
20 |     url_to_index = {}
21 |     index_to_urls = []
22 |     counter = 0
23 |     start_time = time.time()
24 |     with open(input, 'r') as f:
25 |         for line in f:
26 |             counter += 1
27 |             myjson = json.loads(line)
28 |             urls = []
29 |             for main_url in myjson.keys():
30 |                 urls.append(main_url)
31 |                 for value in myjson[main_url]:
32 |                     for other_url, js in value.items():
33 |                         if js >= jaccard_similarity_threshold:
34 |                             urls.append(other_url)
35 |             current_index = -1
36 |             other_indices = set()
37 |             for url in urls:
38 |                 if url in url_to_index:
39 |                     if current_index == -1:
40 |                         current_index = url_to_index[url]
41 |                     elif current_index != url_to_index[url]:
42 |                         other_indices.add(url_to_index[url])
43 |             if current_index == -1:
44 |                 current_index = len(index_to_urls)
45 |                 index_to_urls.append(set())
46 |             for url in urls:
47 |                 url_to_index[url] = current_index
48 |                 index_to_urls[current_index].add(url)
49 |             for index in other_indices:
50 |                 for url in index_to_urls[index]:
51 |                     index_to_urls[current_index].add(url)
52 |                     url_to_index[url] = current_index
53 |                 index_to_urls[index] = None
54 | 
55 |             if counter % 100000 == 0:
56 |                 print(' > processed {} lines in {} seconds ...'.format(
57 |                     counter, time.time() - start_time))
58 | 
59 | 
60 |     total_remove = 0
61 |     total_remain = 0
62 |     for urls in index_to_urls:
63 |         if urls is not None:
64 |             if len(urls) > 1:
65 |                 total_remove += (len(urls) - 1)
66 |                 total_remain += 1
67 |     print('out of {} urls, only {} are unique and {} should be removed'.format(
68 |         total_remove+total_remain, total_remain, total_remove))
69 | 
70 |     with open(output, 'wb') as f:
71 |         for i, urls in enumerate(index_to_urls):
72 |             if urls is not None:
73 |                 if len(urls) > 1:
74 |                     myjson = json.dumps({str(i): list(urls)},
75 |                                         ensure_ascii=False)
76 |                     f.write(myjson.encode('utf-8'))
77 |                     f.write('\n'.encode('utf-8'))
78 | 


--------------------------------------------------------------------------------
/tools/openwebtext/merge_jsons.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | 
 4 | import glob
 5 | import sys
 6 | import json
 7 | import argparse
 8 | 
 9 | if __name__ == '__main__':
10 | 
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--json_path", type=str, default=".",
13 |         help="path where all the json files are located")
14 | 
15 |     parser.add_argument("--output_file", type=str, default="merged_output.json",
16 |         help="filename where the merged json should go")
17 | 
18 |     args = parser.parse_args()
19 | 
20 |     json_path = args.json_path
21 |     out_file = args.output_file
22 | 
23 |     json_files = glob.glob(json_path + '/*.json')
24 | 
25 |     counter = 0
26 | 
27 |     with open(out_file, 'w') as outfile:
28 |         for fname in json_files:
29 |             counter += 1
30 | 
31 |             if counter % 1024 == 0:
32 |                 print("Merging at ", counter, flush=True)
33 | 
34 |             with open(fname, 'r') as infile:
35 |                 for row in infile:
36 |                     each_row = json.loads(row)
37 |                     outfile.write(row)
38 | 
39 | 
40 |     print("Merged file", out_file, flush=True)
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/tools/openwebtext/remove_group_duplicates.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | 
 4 | import json
 5 | import time
 6 | import sys
 7 | 
 8 | 
 9 | if __name__ == '__main__':
10 | 
11 |     url_filename = sys.argv[1]
12 |     data_filename = sys.argv[2]
13 |     output_filename = sys.argv[3]
14 | 
15 |     urls = set()
16 |     with open(url_filename, 'r') as f:
17 |         for line in f:
18 |             myjson = json.loads(line)
19 |             for key in myjson:
20 |                 this_urls = myjson[key]
21 |                 for i in range(1, len(this_urls)):
22 |                     urls.add(this_urls[i])
23 |     print('will be removing {} urls'.format(len(urls)), flush=True)
24 | 
25 |     written_docs = 0
26 |     removed_docs = 0
27 |     removed_chars = 0
28 |     start_time = time.time()
29 |     with open(output_filename, 'wb') as fout:
30 |         with open(data_filename, 'r') as fin:
31 |             for line in fin:
32 |                 try:
33 |                     myjson = json.loads(line)
34 |                     url = myjson['url']
35 |                     if url in urls:
36 |                         print('removing', myjson)
37 |                         removed_docs += 1
38 |                         removed_chars += len(myjson['text'])
39 |                         continue
40 |                     myjson = json.dumps(myjson, ensure_ascii=False)
41 |                     fout.write(myjson.encode('utf-8'))
42 |                     fout.write('\n'.encode('utf-8'))
43 |                     written_docs += 1
44 |                     if written_docs % 10000 == 0:
45 |                         print(' [PROCESSED] time (s): {:.2f} | written: {} '
46 |                               '| removed: {} (char: {})'.format(
47 |                                   time.time() - start_time,
48 |                                   written_docs, removed_docs, removed_chars))
49 |                 except Exception as e:
50 |                     print('[SKIPPING]', line, e)
51 | 
52 |     print(' [PROCESSED] time (s): {:.2f} | written: {} '
53 |           '| removed: {} (char: {})'.format(
54 |               time.time() - start_time,
55 |               written_docs, removed_docs, removed_chars))
56 |     print('done :-)')
57 | 


--------------------------------------------------------------------------------
/tools/run_text_generation_server.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | """Sample Generate GPT"""
 4 | import os
 5 | import sys
 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
 7 |                                              os.path.pardir)))
 8 | import socket
 9 | from megatron import get_args
10 | from megatron import print_rank_0
11 | from megatron.core import mpu
12 | from megatron.checkpointing import load_checkpoint
13 | from megatron.initialize import initialize_megatron
14 | from megatron.model import GPTModel
15 | from megatron.training import get_model
16 | from megatron.text_generation_server import MegatronServer
17 | from megatron.text_generation import generate_and_post_process
18 | from megatron.text_generation import beam_search_and_post_process
19 | import torch
20 | 
21 | def model_provider(pre_process=True, post_process=True):
22 |     """Build the model."""
23 | 
24 |     print_rank_0('building GPT model ...')
25 |     model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process)
26 | 
27 |     return model
28 | 
29 | def add_text_generate_args(parser):
30 |     group = parser.add_argument_group(title='text generation')
31 | 
32 |     group.add_argument("--temperature", type=float, default=1.0,
33 |                        help='Sampling temperature.')
34 |     group.add_argument("--top_p", type=float, default=0.0,
35 |                        help='Top p sampling.')
36 |     group.add_argument("--top_k", type=int, default=0,
37 |                        help='Top k sampling.')
38 |     group.add_argument("--out-seq-length", type=int, default=1024,
39 |                        help='Size of the output generated text.')
40 |     return parser
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     initialize_megatron(extra_args_provider=add_text_generate_args,
45 |                         args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
46 |                                        'no_load_rng': True,
47 |                                        'no_load_optim': True})
48 | 
49 |     args = get_args()
50 |     if args.num_layers_per_virtual_pipeline_stage is not None:
51 |         print("Interleaved pipeline schedule is not yet supported for text generation.")
52 |         exit()
53 |     # Set up model and load checkpoint
54 |     model = get_model(model_provider, wrap_with_ddp=False)
55 | 
56 |     if args.load is not None:
57 |         _ = load_checkpoint(model, None, None)
58 | 
59 |     assert len(model) == 1, "Above condition should have caught this"
60 |     model = model[0]
61 |     if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
62 |         server = MegatronServer(model)
63 |         server.run("0.0.0.0")
64 | 
65 |     while True:
66 |         choice = torch.cuda.LongTensor(1)
67 |         torch.distributed.broadcast(choice, 0)
68 |         if choice[0].item() == 0:
69 |             try:
70 |                 generate_and_post_process(model)
71 |             except ValueError as ve:
72 |                 pass
73 |         elif choice[0].item() == 1:
74 |             try:
75 |                 beam_search_and_post_process(model)
76 |             except ValueError as ve:
77 |                 pass
78 | 


--------------------------------------------------------------------------------
/tools/text_generation_cli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | import json
 3 | import sys
 4 | import urllib2
 5 | class PutRequest(urllib2.Request):
 6 |     '''class to handling putting with urllib2'''
 7 | 
 8 |     def get_method(self, *args, **kwargs):
 9 |         return 'PUT'
10 | 
11 | if __name__ == "__main__":
12 |     url = sys.argv[1]
13 |     while True:
14 |         sentence = raw_input("Enter prompt: ")
15 |         tokens_to_generate = int(input("Enter number of tokens to generate: "))
16 |         data = json.dumps({"prompts": [sentence], "tokens_to_generate":tokens_to_generate})
17 |         req = PutRequest(url, data, {'Content-Type': 'application/json'})
18 |         response = urllib2.urlopen(req)
19 |         resp_sentences = json.load(response)
20 |         print("Megatron Response: ")
21 |         print(resp_sentences["text"][0])
22 | 


--------------------------------------------------------------------------------