├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── README_ORIGINAL.md ├── examples ├── pretrain_gpt3_5B_24layers_bf16.sh ├── pretrain_gpt3_5B_24layers_bf16_bs1024_slurm.sh ├── pretrain_gpt3_6.7B.slurm ├── pretrain_gpt3_6.7B_32layers_bf16.sh ├── pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh ├── pretrain_gpt3_6.7B_compile.slurm └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── schedules.py ├── text_generation_server.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_mp.py ├── pretrain_ict.py ├── pretrain_t5.py ├── pretrain_vit.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── 
datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests └── test_basic.py └── tools ├── linter.py ├── merge_mp_partitions.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── run_text_generation_server.py └── text_generation_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file defines code owners for the repository. It allows setting code reviewers for all pull requests that merge to the master branch 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # Reference guide - https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-code-owners#example-[…]ners-file 5 | # Example - These owners will be the default owners for everything in 6 | # the repo. Unless a later match takes precedence, 7 | # @global-owner1 and @global-owner2 will be requested for 8 | # review when someone opens a pull request. 9 | # * @global-owner1 @global-owner2 10 | 11 | * @aws-maens @aws-mesharma @musunita 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can.
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository is no longer supported. 2 | 3 | Please start using AWS Neuron reference for NeMo Megatron (https://github.com/aws-neuron/neuronx-nemo-megatron) or Neuron Distributed (https://github.com/aws-neuron/neuronx-distributed) instead. 4 | 5 | This Megatron-LM library (source https://github.com/NVIDIA/Megatron-LM) is adapted for use with AWS Trainium via the AWS Neuron SDK.
6 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_5B_24layers_bf16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_5B_24layers_bf16 5 | 6 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 7 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME} 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 13 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 14 | 15 | export NEURON_RT_STOCHASTIC_ROUNDING_SEED=0 16 | export NEURON_FUSE_SOFTMAX=1 17 | export XLA_USE_BF16=1 18 | 19 | # Workaround "Too many open files" error with GPT training on U20 server AMI 20 | ulimit -n 8192 21 | 22 | TRAIN_ITERS=10000 23 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 24 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 25 | TRAIN_ITERS=65 26 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 27 | fi 28 | 29 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 30 | --tensor-model-parallel-size 8 \ 31 | --num-layers 24 \ 32 | --hidden-size 4096 \ 33 | --num-attention-heads 32 \ 34 | --micro-batch-size 1 \ 35 | --global-batch-size 64 \ 36 | --seq-length 2048 \ 37 | --max-position-embeddings 2048 \ 38 | --train-iters $TRAIN_ITERS \ 39 | --lr-decay-iters 320000 \ 40 | --data-path $DATA_PATH \ 41 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 42 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 43 | --data-impl mmap \ 44 | --split 100,0,0 \ 45 | --distributed-backend xla \ 46 | --lr 0.00015 \ 47 | --lr-decay-style cosine \ 48 | --min-lr 1.0e-5 \ 49 | --weight-decay 1e-2 \ 50 | --clip-grad 1 \ 51 | --lr-warmup-fraction .01 \ 52 | --log-interval 1 \ 53 | --tensorboard-log-interval 1 \ 54 | --eval-interval $TRAIN_ITERS \ 55 | --eval-iters 1000 \ 56 | --attention-dropout 0 \ 57 | --hidden-dropout 0 \ 58 | --no-masked-softmax-fusion \ 59 | --no-bias-gelu-fusion \ 60 | --no-bias-dropout-fusion \ 61 | --no-async-tensor-model-parallel-allreduce \ 62 | --no-contiguous-buffers-in-local-ddp \ 63 | --save-xser $CHECKPOINT_PATH \ 64 | --save-interval 2000 \ 65 | --keep-last-checkpoint-only \ 66 | --use-cpu-initialization \ 67 | --tensorboard-dir $TB_DIR \ 68 | |& tee run_log_$MODEL_CONFIG_NAME.txt & 69 | wait %1 70 | 71 | ret_val=$? 72 | if [ $ret_val -eq 0 ]; then 73 | success=1 74 | else 75 | success=0 76 | fi 77 | 78 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 79 | if [ -e $dump_to_s3_update_json_scr ]; then 80 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 81 | else 82 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 83 | fi 84 | 85 | exit $ret_val 86 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_5B_24layers_bf16_bs1024_slurm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_5B_24layers_bf16 5 | 6 | # Enable Elastic Fabric Adapter for higher networking performance 7 | export FI_EFA_USE_DEVICE_RDMA=1 8 | export FI_PROVIDER=efa 9 | export FI_EFA_FORK_SAFE=1 10 | 11 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 12 | 13 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 14 | MASTER_PORT=2022 15 | NUM_NEURONCORES=32 16 | 17 | WORLD_SIZE_JOB=$SLURM_NTASKS 18 | RANK_NODE=$SLURM_NODEID 19 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 20 | echo $DISTRIBUTED_ARGS 21 | 22 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME}_${WORLD_SIZE_JOB} 23 | 24 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 25 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 26 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 27 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 28 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 29 | export NEURON_FUSE_SOFTMAX=1 30 | # Cast training to BF16 and enable stochastic rounding 31 | export XLA_USE_BF16=1 32 | # Increase Neuron RT execution timeout in case slow compilation causes Neuron RT to wait longer than the default timeout 33 | export NEURON_RT_EXEC_TIMEOUT=600 34 | 35 | # Use a separate NeuronCache dir per node, to work around file-locking limitations on NFS 36 | export NEURON_CC_FLAGS="--cache_dir=$HOME/neuron_cache/gpt/`hostname`" 37 | 38 | TRAIN_ITERS=143051 39 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 40 | # Run fewer steps and ignore TensorBoard output when only extracting graphs (neuron_parallel_compile) 41 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 42 | TRAIN_ITERS=65 43 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 44 | fi 45 | 46 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 47 | --tensor-model-parallel-size 8 \ 48 | --num-layers 24 \ 49 | --hidden-size 4096 \ 50 | --num-attention-heads 32 \ 51 | --micro-batch-size 1 \ 52 | --global-batch-size 1024 \ 53 | --seq-length 2048 \ 54 | --max-position-embeddings 2048 \ 55 | --train-iters $TRAIN_ITERS \ 56 | --lr-decay-iters 123977 \ 57 | --data-path $DATA_PATH \ 58 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 59 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 60 | --data-impl mmap \ 61 | --split 100,0,0 \ 62 | --distributed-backend xla \ 63 | --lr 0.00012 \ 64 | --lr-decay-style cosine \ 65 | --min-lr 1.2e-5 \ 66 | --weight-decay 1e-1 \ 67 | --clip-grad 1 \ 68 | --lr-warmup-fraction 0.00125 \ 69 | --log-interval 1 \ 70 | --tensorboard-log-interval 1 \ 71 | --eval-interval $TRAIN_ITERS \ 72 | --eval-iters 1000 \ 73 | --attention-dropout 0 \ 74 | --hidden-dropout 0 \ 75 | --no-masked-softmax-fusion \ 76 | --no-bias-gelu-fusion \ 77 | --no-bias-dropout-fusion \ 78 | --no-async-tensor-model-parallel-allreduce \ 79 | --no-contiguous-buffers-in-local-ddp \ 80 | --init-method-std 0.006 \ 81 | --adam-beta1 0.9 \ 82 | --adam-beta2 0.95 \ 83 | --save-xser $CHECKPOINT_PATH \ 84 | --save-interval 2000 \ 85 | --keep-last-checkpoint-only \ 86 | --use-cpu-initialization \ 87 | --tensorboard-dir $TB_DIR \ 88 | |& tee run_log_$MODEL_CONFIG_NAME.$RANK_NODE.$WORLD_SIZE_JOB.txt & 89 | wait %1 90 | 91 | ret_val=$?
92 | 93 | if [ $ret_val -eq 0 ] ; then 94 | msg="SUCCESS" 95 | elif [ $ret_val -eq 2 ] ; then 96 | msg="SCANCEL/INTERRUPT" 97 | else 98 | msg="INTERNAL FAILURE" 99 | # Uncomment lines below to requeue after internal failure (make sure the script doesn't fail) 100 | #msg="INTERNAL FAILURE - HARDWARE ISSUE? Requeue JOB ID ${SLURM_JOB_ID} - use scancel to terminate" 101 | #scontrol requeue ${SLURM_JOB_ID} 102 | fi 103 | echo $msg 104 | 105 | if [ $ret_val -eq 0 ]; then 106 | success=1 107 | else 108 | success=0 109 | fi 110 | 111 | # Below is for testing only, not needed for actual execution 112 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 113 | if [ -e $dump_to_s3_update_json_scr ]; then 114 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 115 | else 116 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 117 | fi 118 | 119 | exit $ret_val 120 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=16 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | #SBATCH --requeue 6 | #SBATCH --open-mode=append 7 | 8 | srun ./examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh 9 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_32layers_bf16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_6.7B_32layers_bf16 5 | 6 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 7 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME} 8 | 9 | NUM_NEURONCORES=32 10 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" 11 | 12 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 13 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 14 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 15 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 16 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 17 | export NEURON_FUSE_SOFTMAX=1 18 | # Cast training to BF16 and enable stochastic rounding 19 | export XLA_USE_BF16=1 20 | 21 | # Workaround "Too many open files" error with GPT training on U20 server AMI 22 | ulimit -n 8192 23 | 24 | TRAIN_ITERS=10000 25 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 26 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 27 | TRAIN_ITERS=65 28 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 29 | fi 30 | 31 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 32 | --tensor-model-parallel-size 8 \ 33 | --num-layers 32 \ 34 | --hidden-size 4096 \ 35 | --num-attention-heads 32 \ 36 | --micro-batch-size 1 \ 37 | --global-batch-size 64 \ 38 | --seq-length 2048 \ 39 | --max-position-embeddings 2048 \ 40 | --train-iters $TRAIN_ITERS \ 41 | --lr-decay-iters 320000 \ 42 | --data-path $DATA_PATH \ 43 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 44 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 45 | --data-impl mmap \ 46 | --split 100,0,0 \ 47 | --distributed-backend xla \ 48 | --lr 0.00015 \ 49 | --lr-decay-style cosine \ 50 | --min-lr 1.0e-5 \ 51 | --weight-decay 1e-2 \ 52 | --clip-grad 1 \ 53 | --lr-warmup-fraction .01 \ 54 | --log-interval 1 \ 55
| --tensorboard-log-interval 1 \ 56 | --eval-interval $TRAIN_ITERS \ 57 | --eval-iters 1000 \ 58 | --attention-dropout 0 \ 59 | --hidden-dropout 0 \ 60 | --no-masked-softmax-fusion \ 61 | --no-bias-gelu-fusion \ 62 | --no-bias-dropout-fusion \ 63 | --no-async-tensor-model-parallel-allreduce \ 64 | --no-contiguous-buffers-in-local-ddp \ 65 | --save-xser $CHECKPOINT_PATH \ 66 | --save-interval 2000 \ 67 | --keep-last-checkpoint-only \ 68 | --use-cpu-initialization \ 69 | --tensorboard-dir $TB_DIR \ 70 | |& tee run_log_$MODEL_CONFIG_NAME.txt & 71 | wait %1 72 | 73 | ret_val=$? 74 | if [ $ret_val -eq 0 ]; then 75 | success=1 76 | else 77 | success=0 78 | fi 79 | 80 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 81 | if [ -e $dump_to_s3_update_json_scr ]; then 82 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 83 | else 84 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 85 | fi 86 | 87 | exit $ret_val 88 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -o pipefail 3 | 4 | MODEL_CONFIG_NAME=gpt3_6.7B_32layers_bf16 5 | 6 | # Enable Elastic Fabric Adapter for higher networking performance 7 | export FI_EFA_USE_DEVICE_RDMA=1 8 | export FI_PROVIDER=efa 9 | export FI_EFA_FORK_SAFE=1 10 | 11 | DATA_PATH=~/examples_datasets/gpt2/my-gpt2_text_document 12 | 13 | MASTER_ADDR=(`scontrol show hostnames $SLURM_JOB_NODELIST`) 14 | MASTER_PORT=2022 15 | NUM_NEURONCORES=32 16 | 17 | WORLD_SIZE_JOB=$SLURM_NTASKS 18 | RANK_NODE=$SLURM_NODEID 19 | DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE_JOB --node_rank $RANK_NODE --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 20 | echo $DISTRIBUTED_ARGS 21 | 22 | CHECKPOINT_PATH=chkpt_${MODEL_CONFIG_NAME}_${WORLD_SIZE_JOB} 23 | 24 | # Keep only the 3 most recent graphs loaded in the Neuron runtime for each process, to reduce device memory usage 25 | export NEURON_NUM_RECENT_MODELS_TO_KEEP=3 26 | # Mark all parameter transfers as static to enable runtime optimizations for wrapped torch.nn modules 27 | export NEURON_TRANSFER_ALL_PARAMETERS_WITH_STATIC_RING=1 28 | # Enable custom lowering for the Softmax operation so the compiler can optimize it and improve GPT performance 29 | export NEURON_FUSE_SOFTMAX=1 30 | # Cast training to BF16 and enable stochastic rounding 31 | export XLA_USE_BF16=1 32 | # Increase Neuron RT execution timeout in case slow compilation causes Neuron RT to wait longer than the default timeout 33 | export NEURON_RT_EXEC_TIMEOUT=600 34 | 35 | # Use a separate NeuronCache dir per node, to work around file-locking limitations on NFS 36 | export NEURON_CC_FLAGS="--cache_dir=$HOME/neuron_cache/gpt/`hostname`" 37 | 38 | TRAIN_ITERS=143051 39 | TB_DIR=./tb_${MODEL_CONFIG_NAME} 40 | # Run fewer steps and ignore TensorBoard output when only extracting graphs (neuron_parallel_compile) 41 | if [[ "$NEURON_EXTRACT_GRAPHS_ONLY" == "1" ]]; then 42 | # Use a larger trial count to work around extra recompilation due to https://github.com/pytorch/xla/issues/4994 43 | TRAIN_ITERS=325 44 | TB_DIR=/tmp/parallel_compile_ignored_tb_output 45 | fi 46 | 47 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 48 | --tensor-model-parallel-size 8 \ 49 | --num-layers 32 \ 50 | --hidden-size 4096 \ 51 | --num-attention-heads 32 \ 52 | --micro-batch-size 1 \ 53 |
--global-batch-size 1024 \ 54 | --seq-length 2048 \ 55 | --max-position-embeddings 2048 \ 56 | --train-iters $TRAIN_ITERS \ 57 | --lr-decay-iters 123977 \ 58 | --data-path $DATA_PATH \ 59 | --vocab-file ~/examples_datasets/gpt2/gpt2-vocab.json \ 60 | --merge-file ~/examples_datasets/gpt2/gpt2-merges.txt \ 61 | --data-impl mmap \ 62 | --split 100,0,0 \ 63 | --distributed-backend xla \ 64 | --lr 0.00012 \ 65 | --lr-decay-style cosine \ 66 | --min-lr 1.2e-5 \ 67 | --weight-decay 1e-1 \ 68 | --clip-grad 1 \ 69 | --lr-warmup-fraction 0.00125 \ 70 | --log-interval 1 \ 71 | --tensorboard-log-interval 1 \ 72 | --eval-interval $TRAIN_ITERS \ 73 | --eval-iters 1000 \ 74 | --attention-dropout 0 \ 75 | --hidden-dropout 0 \ 76 | --no-masked-softmax-fusion \ 77 | --no-bias-gelu-fusion \ 78 | --no-bias-dropout-fusion \ 79 | --no-async-tensor-model-parallel-allreduce \ 80 | --no-contiguous-buffers-in-local-ddp \ 81 | --init-method-std 0.006 \ 82 | --adam-beta1 0.9 \ 83 | --adam-beta2 0.95 \ 84 | --save-xser $CHECKPOINT_PATH \ 85 | --save-interval 2000 \ 86 | --keep-last-checkpoint-only \ 87 | --use-cpu-initialization \ 88 | --tensorboard-dir $TB_DIR \ 89 | |& tee run_log_$MODEL_CONFIG_NAME.$RANK_NODE.$WORLD_SIZE_JOB.txt & 90 | wait %1 91 | 92 | ret_val=$? 93 | 94 | if [ $ret_val -eq 0 ] ; then 95 | msg="SUCCESS" 96 | elif [ $ret_val -eq 2 ] ; then 97 | msg="SCANCEL/INTERRUPT" 98 | else 99 | msg="INTERNAL FAILURE" 100 | # Uncomment lines below to requeue after internal failure (make sure the script doesn't fail) 101 | #msg="INTERNAL FAILURE - HARDWARE ISSUE? Requeue JOB ID ${SLURM_JOB_ID} - use scancel to terminate" 102 | #scontrol requeue ${SLURM_JOB_ID} 103 | fi 104 | echo $msg 105 | 106 | if [ $ret_val -eq 0 ]; then 107 | success=1 108 | else 109 | success=0 110 | fi 111 | 112 | # Below is for testing only, not needed for actual execution 113 | dump_to_s3_update_json_scr=../../dump_to_s3_update_test_json.sh 114 | if [ -e $dump_to_s3_update_json_scr ]; then 115 | $dump_to_s3_update_json_scr $@ --key=inference_success --value=$success || echo "Unable to update test result JSON." 116 | else 117 | echo "WARNING: Script $dump_to_s3_update_json_scr not found. Not updating test result JSON." 118 | fi 119 | 120 | exit $ret_val 121 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_6.7B_compile.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --nodes=16 3 | #SBATCH --exclusive 4 | #SBATCH --output=slurm-%x-%j.out 5 | #SBATCH --requeue 6 | #SBATCH --open-mode=append 7 | 8 | srun neuron_parallel_compile ./examples/pretrain_gpt3_6.7B_32layers_bf16_bs1024_slurm.sh 9 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 
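# Note: TP, PP, MBS, GBS, NLS, HS, NAH, and DDP below are set by the calling run_*.sh script before this file is sourced.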
27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Setup 13 | 14 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 15 | update the unspecified values (in angle brackets `<...>`) before launching any 16 | scripts. 17 | 18 | 19 | 20 | ## Scripts 21 | 22 | Below is a list of scripts that can be used to reproduce various figures in our 23 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 24 | 25 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 26 | for GPT models ranging from 1 billion to 1 trillion parameters. 27 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 28 | performance of pipeline parallelism. 29 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 30 | the interleaved schedule on a 175B GPT model. 31 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 32 | different degrees of pipeline and tensor model parallelism on a model with 33 | 162.2 billion parameters. 34 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 35 | different degrees of data and pipeline model parallelism on a model with 36 | 5.9 billion parameters. 37 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 38 | different degrees of data and tensor model parallelism on a model with 39 | 5.9 billion parameters. 40 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 41 | microbatch size. 42 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 43 | activation recomputation. 44 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 45 | the scatter-gather communication optimization. 
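As a quick orientation, a filled-in `CONFIG.sh` header might look like the sketch below. Every value shown is a hypothetical placeholder for one particular cluster setup, not a recommendation — substitute your own partition, account, and paths:

```bash
# Hypothetical example values for the unspecified CONFIG.sh variables.
export SLURM_PARTITION=batch                  # your cluster's Slurm partition
export SLURM_ACCOUNT=my-account               # your Slurm account
export MEGATRON_CODE_DIR=/shared/megatron-lm  # checkout of this repository
export DOCKER_MOUNT_DIR=/shared/data          # mounted into the container; holds data and tokenizer files
MEGATRON_DATA=${DOCKER_MOUNT_DIR}/my-gpt2_text_document
BPE_VOCAB_FILE=${DOCKER_MOUNT_DIR}/gpt2-vocab.json
BPE_MERGE_FILE=${DOCKER_MOUNT_DIR}/gpt2-merges.txt
```

Once these values are filled in, each `run_*.sh` script is self-contained: it sets the model and parallelism variables (TP, PP, GBS, and so on), sources `CONFIG.sh`, and submits the job through `SBATCH.sh` and `SRUN.sh`.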
46 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_table_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | # model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T] 7 | MODEL_SIZE=1.7B 8 | 9 | 10 | 11 | 12 | 13 | 14 | if [ ${MODEL_SIZE} == "1.7B" ]; then 15 | TP=1 16 | PP=1 17 | MBS=16 18 | GBS=512 19 | NLS=24 20 | HS=2304 21 | NAH=24 22 | DDP=torch 23 | NNODES=4 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | elif [ ${MODEL_SIZE} == "3.6B" ]; then 26 | TP=2 27 | PP=1 28 | MBS=16 29 | GBS=512 30 | NLS=30 31 | HS=3072 32 | NAH=32 33 | DDP=torch 34 | NNODES=8 35 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 36 | elif [ ${MODEL_SIZE} == "7.5B" ]; then 37 | TP=4 38 | PP=1 39 | MBS=16 40 | GBS=512 41 | NLS=36 42 | HS=4096 43 | NAH=32 44 | DDP=torch 45 | NNODES=16 46 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 47 | elif [ ${MODEL_SIZE} == "18B" ]; then 48 | TP=8 49 | PP=1 50 | MBS=8 51 | GBS=1024 52 | NLS=40 53 | HS=6144 54 | NAH=48 55 | DDP=torch 56 | NNODES=32 57 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 58 | elif [ ${MODEL_SIZE} == "39B" ]; then 59 | TP=8 60 | PP=2 61 | MBS=4 62 | GBS=1536 63 | NLS=48 64 | HS=8192 65 | NAH=64 66 | DDP=local 67 | NNODES=64 68 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 69 | elif [ ${MODEL_SIZE} == "76B" ]; then 70 | TP=8 71 | PP=4 72 | MBS=2 73 | GBS=1792 74 | NLS=60 75 | HS=10240 76 | NAH=80 77 | DDP=local 78 | NNODES=128 79 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5" 80 | elif [ ${MODEL_SIZE} == "145B" ]; then 81 | TP=8 82 | PP=8 83 | MBS=2 84 | GBS=2304 85 | NLS=80 86 | HS=12288 87 | NAH=96 88 | DDP=local 89 | NNODES=192 90 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 " 91 | elif [ ${MODEL_SIZE} == "310B" ]; then 92 | TP=8 93 | PP=16 94 | MBS=1 95 | GBS=2160 96 | NLS=96 97 | HS=16384 98 | NAH=128 99 | DDP=local 100 | NNODES=240 101 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 " 102 | elif [ ${MODEL_SIZE} == "530B" ]; then 103 | TP=8 104 | PP=35 105 | MBS=1 106 | GBS=2520 107 | NLS=105 108 | HS=20480 109 | NAH=128 110 | DDP=local 111 | NNODES=315 112 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 " 113 | elif [ ${MODEL_SIZE} == "1T" ]; then 114 | TP=8 115 | PP=64 116 | MBS=1 117 | GBS=3072 118 | NLS=128 119 | HS=25600 120 | NAH=160 121 | DDP=local 122 | NNODES=384 123 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 124 | else 125 | echo "Invalid configuration" 126 | exit 1 127 | fi 128 | 129 | 130 | # Name of the job 131 | export JOB_NAME=results_table_1_model_size_${MODEL_SIZE} 132 | 133 | 134 | # Import the configs. 135 | . `pwd`/CONFIG.sh 136 | 137 | 138 | # Submit the job. 139 | . 
`pwd`/SBATCH.sh 140 | 141 | 142 | exit 0 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .global_vars import get_args 18 | from .global_vars import get_current_global_batch_size 19 | from .global_vars import get_num_microbatches 20 | from .global_vars import update_num_microbatches 21 | from .global_vars import get_tokenizer 22 | from .global_vars import get_tensorboard_writer 23 | from .global_vars import get_adlr_autoresume 24 | from .global_vars import get_timers 25 | from .initialize import initialize_megatron 26 | from .mpu.initialize import get_tensor_model_parallel_rank, get_data_parallel_rank 27 | 28 | def print_rank_0(message): 29 | """If distributed is initialized, print only on rank 0.""" 30 | if torch.distributed.is_initialized(): 31 | if torch.distributed.get_rank() == 0: 32 | print(message, flush=True) 33 | else: 34 | print(message, flush=True) 35 | 36 | def is_last_rank(): 37 | return torch.distributed.get_rank() == ( 38 | torch.distributed.get_world_size() - 1) 39 | 40 | def print_rank_last(message): 41 | """If distributed is initialized, print only on last rank.""" 42 | if torch.distributed.is_initialized(): 43 | if is_last_rank(): 44 | print(message, flush=True) 45 | 46 | def print_rank_2D(message): 47 | """If distributed is initialized, print on every rank, prefixed with its tensor- and data-parallel ranks.""" 48 | if torch.distributed.is_initialized(): 49 | tp_rank = get_tensor_model_parallel_rank() 50 | dp_rank = get_data_parallel_rank() 51 | print(f'tp:{tp_rank},dp:{dp_rank}:'+message, flush=True) 52 | else: 53 | print(message, flush=True) 54 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | PYTHON_PATH = $(shell readlink -f $(shell which python3)) 5 | ifneq ("$(wildcard $(PYTHON_PATH)m-config)","") 6 | LIBEXT = $(shell $(PYTHON_PATH)m-config --extension-suffix) 7 | else ifneq ("$(wildcard $(PYTHON_PATH)-config)","") 8 | LIBEXT = $(shell $(PYTHON_PATH)-config --extension-suffix) 9 | else 10 | $(error "python3-config not found.
Please run 'sudo apt install -y python3-dev' on Ubuntu or 'sudo yum install -y python3-devel' on Amazon Linux.") 11 | endif 12 | 13 | default: $(LIBNAME)$(LIBEXT) 14 | 15 | %$(LIBEXT): %.cpp 16 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 17 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indices. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/test/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # This file isn't really a formal automated test; it's just a place to 2 | # put some code used during development and manual testing of 3 | # indexed_dataset.
4 | 5 | from megatron.data import indexed_dataset 6 | from megatron.tokenizer import build_tokenizer 7 | import argparse 8 | import os 9 | import sys 10 | 11 | import torch 12 | 13 | script_dir = os.path.dirname(os.path.realpath(__file__)) 14 | sys.path.append(os.path.join(script_dir, "../../../")) 15 | 16 | 17 | def test_indexed_dataset(args): 18 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 19 | tokenizer = build_tokenizer(args) 20 | print(len(ds.doc_idx)) 21 | print(len(ds)) 22 | print(ds.doc_idx[-1]) 23 | if ds.supports_prefetch: 24 | # just prefetch the whole thing in test (so assume it is small) 25 | ds.prefetch(range(len(ds))) 26 | if args.count > len(ds.doc_idx) - 1: 27 | args.count = len(ds.doc_idx) - 1 28 | 29 | for i in range(args.count): 30 | start = ds.doc_idx[i] 31 | end = ds.doc_idx[i + 1] 32 | ids = ds[start:end] 33 | print(f"Document {i}:") 34 | print("--------------") 35 | for s in ids: 36 | assert len(s) > 0 37 | l = s.data.tolist() 38 | text = tokenizer.detokenize(l) 39 | print(text) 40 | print("---") 41 | 42 | 43 | def test_indexed_dataset_get(args): 44 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 45 | tokenizer = build_tokenizer(args) 46 | size = ds.sizes[0] 47 | print(f"size: {size}") 48 | full = ds.get(0) 49 | print(full) 50 | # print(tokenizer.detokenize(full.data.tolist())) 51 | print("---") 52 | end = ds.get(0, offset=size - 10) 53 | print(end) 54 | # print(tokenizer.detokenize(end.data.tolist())) 55 | 56 | start = ds.get(0, length=10) 57 | print(start) 58 | # print(tokenizer.detokenize(start.data.tolist())) 59 | 60 | part = ds.get(0, offset=2, length=8) 61 | print(part) 62 | # print(tokenizer.detokenize(part.data.tolist())) 63 | 64 | # def test_albert_dataset(args): 65 | # # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) 66 | # # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) 67 | # # ds = AlbertDataset(idataset, tokenizer) 68 | # ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, 69 | # args.epochs, args.max_num_samples, 70 | # args.masked_lm_prob, args.seq_length, 71 | # args.short_seq_prob, args.seed) 72 | # truncated = 0 73 | # total = 0 74 | # for i, s in enumerate(ds): 75 | # ids = s['text'] 76 | # tokens = ds.tokenizer.convert_ids_to_tokens(ids) 77 | # print(tokens) 78 | # if i >= args.count-1: 79 | # exit() 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--data', type=str, help='prefix to data files') 85 | parser.add_argument('--dataset-impl', type=str, default='infer', 86 | choices=['lazy', 'cached', 'mmap', 'infer']) 87 | parser.add_argument('--count', type=int, default=10, 88 | help='Number of samples/documents to print') 89 | 90 | group = parser.add_argument_group(title='tokenizer') 91 | group.add_argument('--tokenizer-type', type=str, required=True, 92 | choices=['BertWordPieceLowerCase', 93 | 'GPT2BPETokenizer'], 94 | help='What type of tokenizer to use.') 95 | group.add_argument('--vocab-file', type=str, default=None, 96 | help='Path to the vocab file') 97 | group.add_argument('--merge-file', type=str, default=None, 98 | help='Path to the BPE merge file (if necessary).') 99 | 100 | parser.add_argument('--epochs', type=int, default=5, 101 | help='Number of epochs to plan for') 102 | parser.add_argument('--max-num-samples', type=int, default=None, 103 | help='Maximum number of samples to plan for') 104 | parser.add_argument('--masked-lm-prob', type=float, default=0.15, 105 | help='probability of masking 
tokens') 106 | parser.add_argument('--seq-length', type=int, default=512, 107 | help='maximum sequence length') 108 | parser.add_argument('--short-seq-prob', type=float, default=0.1, 109 | help='probability of creating a short sequence') 110 | parser.add_argument('--seed', type=int, default=1234, 111 | help='random seed') 112 | args = parser.parse_args() 113 | args.rank = 0 114 | args.make_vocab_size_divisible_by = 128 115 | args.tensor_model_parallel_size = 1 116 | 117 | if args.dataset_impl == "infer": 118 | args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) 119 | 120 | # test_albert_dataset(args) 121 | test_indexed_dataset_get(args) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
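# Usage sketch (hypothetical path; assumes an ImageNet-style layout with class
# subfolders under `train/` and `val/`):
#   train_ds, valid_ds = build_train_valid_datasets(["/data/imagenet"], crop_size=224)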
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """For backward compatibility, we need the class definitions to deserialize.""" 17 | 18 | class LossScaler: 19 | def __init__(self, scale=1): 20 | self.cur_scale = scale 21 | 22 | class DynamicLossScaler: 23 | def __init__(self, 24 | init_scale=2**32, 25 | scale_factor=2., 26 | scale_window=1000, 27 | min_scale=1, 28 | delayed_shift=1, 29 | consecutive_hysteresis=False): 30 | self.cur_scale = init_scale 31 | self.cur_iter = 0 32 | self.last_overflow_iter = -1 33 | self.scale_factor = scale_factor 34 | self.scale_window = scale_window 35 | self.min_scale = min_scale 36 | self.delayed_shift = delayed_shift 37 | self.cur_hysteresis = delayed_shift 38 | self.consecutive_hysteresis = consecutive_hysteresis 39 | 40 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import pathlib 18 | import subprocess 19 | 20 | from torch.utils import cpp_extension 21 | 22 | # Setting this param to a list has a problem of generating different 23 | # compilation commands (with a different order of architectures) and 24 | # leading to recompilation of fused kernels. Set it to an empty string 25 | # to avoid recompilation and assign arch flags explicitly in 26 | # extra_cuda_cflags below 27 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 28 | 29 | 30 | def load(args): 31 | 32 | # Check if CUDA 11 is installed for compute capability 8.0 33 | cc_flag = [] 34 | _, bare_metal_major, _ = _get_cuda_bare_metal_version( 35 | cpp_extension.CUDA_HOME) 36 | if int(bare_metal_major) >= 11: 37 | cc_flag.append('-gencode') 38 | cc_flag.append('arch=compute_80,code=sm_80') 39 | 40 | # Build path 41 | srcpath = pathlib.Path(__file__).parent.absolute() 42 | buildpath = srcpath / 'build' 43 | _create_build_dir(buildpath) 44 | 45 | # Helper function to build the kernels. 46 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 47 | return cpp_extension.load( 48 | name=name, 49 | sources=sources, 50 | build_directory=buildpath, 51 | extra_cflags=['-O3',], 52 | extra_cuda_cflags=['-O3', 53 | '-gencode', 'arch=compute_70,code=sm_70', 54 | '--use_fast_math'] + extra_cuda_flags + cc_flag, 55 | verbose=(args.rank == 0) 56 | ) 57 | 58 | # ============== 59 | # Fused softmax. 60 | # ============== 61 | 62 | if args.masked_softmax_fusion: 63 | extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', 64 | '-U__CUDA_NO_HALF_CONVERSIONS__', 65 | '--expt-relaxed-constexpr', 66 | '--expt-extended-lambda'] 67 | 68 | # Upper triangular softmax. 69 | sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', 70 | srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] 71 | scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( 72 | "scaled_upper_triang_masked_softmax_cuda", 73 | sources, extra_cuda_flags) 74 | 75 | # Masked softmax. 76 | sources=[srcpath / 'scaled_masked_softmax.cpp', 77 | srcpath / 'scaled_masked_softmax_cuda.cu'] 78 | scaled_masked_softmax_cuda = _cpp_extention_load_helper( 79 | "scaled_masked_softmax_cuda", sources, extra_cuda_flags) 80 | 81 | # ================================= 82 | # Mixed precision fused layer norm. 
83 | # ================================= 84 | 85 | extra_cuda_flags = ['-maxrregcount=50'] 86 | sources=[srcpath / 'layer_norm_cuda.cpp', 87 | srcpath / 'layer_norm_cuda_kernel.cu'] 88 | fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( 89 | "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) 90 | 91 | 92 | def _get_cuda_bare_metal_version(cuda_dir): 93 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 94 | universal_newlines=True) 95 | output = raw_output.split() 96 | release_idx = output.index("release") + 1 97 | release = output[release_idx].split(".") 98 | bare_metal_major = release[0] 99 | bare_metal_minor = release[1][0] 100 | 101 | return raw_output, bare_metal_major, bare_metal_minor 102 | 103 | 104 | def _create_build_dir(buildpath): 105 | try: 106 | os.mkdir(buildpath) 107 | except OSError: 108 | if not os.path.isdir(buildpath): 109 | print(f"Creation of the build directory {buildpath} failed") 110 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <cuda_fp16.h> 18 | #include <torch/extension.h> 19 | #include <vector> 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | int get_batch_per_block_cuda( 36 | int query_seq_len, 37 | int key_seq_len, 38 | int batches, 39 | int attn_heads); 40 | 41 | torch::Tensor fwd( 42 | torch::Tensor const& input, 43 | torch::Tensor const& mask, 44 | float scale_factor) { 45 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 46 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 47 | (input.scalar_type() == at::ScalarType::BFloat16), 48 | "Only fp16 and bf16 are supported"); 49 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 50 | 51 | return fwd_cuda(input, mask, scale_factor); 52 | } 53 | 54 | torch::Tensor bwd( 55 | torch::Tensor const& output_grads, 56 | torch::Tensor const& softmax_results, 57 | float scale_factor) { 58 | 59 | AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor"); 60 | AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor"); 61 | 62 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 63 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 64 | "Only fp16 and bf16 are supported"); 65 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 66 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 67 | "Only fp16 and bf16 are supported"); 68 | 69 | return bwd_cuda(output_grads, softmax_results, scale_factor); 70 | } 71 | 72 | int get_batch_per_block( 73 | int query_seq_len, 74 | int key_seq_len, 75 | int batches, 76 | int attn_heads) { 77 | return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); 78 | } 79 | 80 | } // end namespace scaled_masked_softmax 81 | } // end namespace fused_softmax 82 | } // end namespace multihead_attn 83 | 84 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 85 | m.def("forward", 86 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 87 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 88 | 89 | m.def("backward", 90 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 91 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 92 | 93 | m.def("get_batch_per_block", 94 | &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, 95 | "Return Batch per block size." 96 | ); 97 | } 98 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <ATen/ATen.h> 18 | #include <cuda.h> 19 | #include <cuda_runtime.h> 20 | #include <cuda_fp16.h> 21 | #include <cuda_profiler_api.h> 22 | #include <ATen/cuda/CUDAContext.h> 23 | #include <torch/extension.h> 24 | #include "scaled_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ 32 | return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); 33 | } 34 | 35 | 36 | torch::Tensor fwd_cuda( 37 | torch::Tensor const& input, 38 | torch::Tensor const& mask, 39 | float scale_factor) 40 | { 41 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 42 | const int batches = input.size(0); 43 | const int pad_batches = mask.size(0); 44 | const int attn_heads = input.size(1); 45 | const int query_seq_len = input.size(2); 46 | const int key_seq_len = input.size(3); 47 | TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); 48 | TORCH_INTERNAL_ASSERT(query_seq_len > 1); 49 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 50 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 51 | TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); 52 | TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); 53 | 54 | // Output 55 | auto act_options = input.options().requires_grad(false); 56 | torch::Tensor softmax_results = 57 | torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); 58 | 59 | // Softmax Intermediate Result Ptr 60 | void* input_ptr = static_cast<void*>(input.data_ptr()); 61 | void* mask_ptr = static_cast<void*>(mask.data_ptr()); 62 | void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr()); 63 | 64 | DISPATCH_HALF_AND_BFLOAT( 65 | input.scalar_type(), 66 | "dispatch_scaled_masked_softmax_forward", 67 | dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>( 68 | reinterpret_cast<scalar_t*>(softmax_results_ptr), 69 | reinterpret_cast<const scalar_t*>(input_ptr), 70 | reinterpret_cast<const uint8_t*>(mask_ptr), 71 | scale_factor, 72 | query_seq_len, 73 | key_seq_len, 74 | batches, 75 | attn_heads, 76 | pad_batches); 77 | ); 78 | return softmax_results; 79 | } 80 | 81 | torch::Tensor bwd_cuda( 82 | torch::Tensor const& output_grads_, 83 | torch::Tensor const& softmax_results_, 84 | float scale_factor) { 85 | 86 | auto output_grads = output_grads_.contiguous(); 87 | auto softmax_results = softmax_results_.contiguous(); 88 | 89 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 90 | const int batches = output_grads.size(0); 91 | const int attn_heads = output_grads.size(1); 92 | const int query_seq_len = output_grads.size(2); 93 | const int key_seq_len = output_grads.size(3); 94 | 95 | void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr()); 96 | 97 | //Softmax Grad 98 | DISPATCH_HALF_AND_BFLOAT( 99 | output_grads_.scalar_type(), 100 | "dispatch_scaled_masked_softmax_backward", 101 | dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>( 102 | reinterpret_cast<scalar_t*>(output_grads_ptr), 103 | reinterpret_cast<scalar_t*>(output_grads_ptr), 104 | reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), 105 | scale_factor, 106 | query_seq_len, 107 | key_seq_len, 108 | batches, 109 | attn_heads); 110 | ); 111 | 112 | //backward pass is completely in-place 113 | return output_grads; 114 | } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA 
CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <cuda_fp16.h> 18 | #include <torch/extension.h> 19 | #include <vector> 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || 37 | (input.scalar_type() == at::ScalarType::BFloat16), 38 | "Only fp16 and bf16 are supported"); 39 | 40 | return fwd_cuda(input, scale_factor); 41 | } 42 | 43 | torch::Tensor bwd( 44 | torch::Tensor const& output_grads, 45 | torch::Tensor const& softmax_results, 46 | float scale_factor) { 47 | 48 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 49 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 50 | 51 | AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || 52 | (output_grads.scalar_type() == at::ScalarType::BFloat16), 53 | "Only fp16 and bf16 are supported"); 54 | AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || 55 | (softmax_results.scalar_type() == at::ScalarType::BFloat16), 56 | "Only fp16 and bf16 are supported"); 57 | 58 | return bwd_cuda(output_grads, softmax_results, scale_factor); 59 | } 60 | 61 | } // end namespace scaled_upper_triang_masked_softmax 62 | } // end namespace fused_softmax 63 | } // end namespace multihead_attn 64 | 65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 66 | m.def("forward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 69 | m.def("backward", 70 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 71 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 72 | } 73 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <ATen/ATen.h> 18 | #include <cuda.h> 19 | #include <cuda_runtime.h> 20 | #include <cuda_fp16.h> 21 | #include <cuda_profiler_api.h> 22 | #include <ATen/cuda/CUDAContext.h> 23 | #include <torch/extension.h> 24 | #include "scaled_upper_triang_masked_softmax.h" 25 | #include "type_shim.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast<void*>(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr()); 48 | 49 | DISPATCH_HALF_AND_BFLOAT( 50 | input.scalar_type(), 51 | "dispatch_scaled_upper_triang_masked_softmax_forward", 52 | dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>( 53 | reinterpret_cast<scalar_t*>(softmax_results_ptr), 54 | reinterpret_cast<const scalar_t*>(input_ptr), 55 | scale_factor, 56 | seq_len, 57 | seq_len, 58 | attn_batches); 59 | ); 60 | return softmax_results; 61 | } 62 | 63 | 64 | torch::Tensor bwd_cuda( 65 | torch::Tensor const& output_grads_, 66 | torch::Tensor const& softmax_results_, 67 | float scale_factor) { 68 | 69 | auto output_grads = output_grads_.contiguous(); 70 | auto softmax_results = softmax_results_.contiguous(); 71 | 72 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 73 | const int attn_batches = output_grads.size(0); 74 | const int seq_len = output_grads.size(1); 75 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 76 | 77 | void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr()); 78 | 79 | //Softmax Grad 80 | DISPATCH_HALF_AND_BFLOAT( 81 | output_grads_.scalar_type(), 82 | "dispatch_scaled_upper_triang_masked_softmax_backward", 83 | dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>( 84 | reinterpret_cast<scalar_t*>(output_grads_ptr), 85 | reinterpret_cast<scalar_t*>(output_grads_ptr), 86 | reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), 87 | scale_factor, 88 | seq_len, 89 | seq_len, 90 | attn_batches); 91 | ); 92 | 93 | //backward pass is completely in-place 94 | return output_grads; 95 | } 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | #include <ATen/ATen.h> 19 | #include "compat.h" 20 | 21 | 22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 23 | switch(TYPE) \ 24 | { \ 25 | case at::ScalarType::Half: \ 26 | { \ 27 | using scalar_t = at::Half; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::BFloat16: \ 32 | { \ 33 | using scalar_t = at::BFloat16; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | default: \ 38 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 39 | } 40 | 41 | 42 | 43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ 44 | switch(TYPEIN) \ 45 | { \ 46 | case at::ScalarType::Float: \ 47 | { \ 48 | using scalar_t_in = float; \ 49 | switch(TYPEOUT) \ 50 | { \ 51 | case at::ScalarType::Float: \ 52 | { \ 53 | using scalar_t_out = float; \ 54 | __VA_ARGS__; \ 55 | break; \ 56 | } \ 57 | case at::ScalarType::Half: \ 58 | { \ 59 | using scalar_t_out = at::Half; \ 60 | __VA_ARGS__; \ 61 | break; \ 62 | } \ 63 | case at::ScalarType::BFloat16: \ 64 | { \ 65 | using scalar_t_out = at::BFloat16; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | default: \ 70 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 71 | } \ 72 | break; \ 73 | } \ 74 | case at::ScalarType::Half: \ 75 | { \ 76 | using scalar_t_in = at::Half; \ 77 | using scalar_t_out = at::Half; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | case at::ScalarType::BFloat16: \ 82 | { \ 83 | using scalar_t_in = at::BFloat16; \ 84 | using scalar_t_out = at::BFloat16; \ 85 | __VA_ARGS__; \ 86 | break; \ 87 | } \ 88 | default: \ 89 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 90 | } 91 | 92 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved. 
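# Note on the import right below: the CUDA MixedFusedLayerNorm from
# fused_layer_norm.py is commented out in favor of the stock torch.nn.LayerNorm,
# presumably because the custom fused CUDA kernel cannot run on Neuron/XLA
# devices, while the framework-native op can.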
16 | 17 | #from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 18 | from torch.nn import LayerNorm 19 | 20 | from .distributed import DistributedDataParallel 21 | from .bert_model import BertModel 22 | from .gpt_model import GPTModel 23 | from .t5_model import T5Model 24 | from .language_model import get_language_model 25 | from .module import Float16Module 26 | from .enums import ModelType 27 | -------------------------------------------------------------------------------- /megatron/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Classification model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_last 21 | from megatron import mpu 22 | from megatron.model.enums import AttnMaskType 23 | from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids 24 | from megatron.model.language_model import get_language_model 25 | from megatron.model.utils import get_linear_layer 26 | from megatron.model.utils import init_method_normal 27 | from megatron.model.utils import scaled_init_method_normal 28 | from .module import MegatronModule 29 | 30 | 31 | class Classification(MegatronModule): 32 | 33 | def __init__(self, 34 | num_classes, 35 | num_tokentypes=2, 36 | pre_process=True, 37 | post_process=True): 38 | super(Classification, self).__init__(share_word_embeddings=False) 39 | args = get_args() 40 | 41 | self.num_classes = num_classes 42 | self.pre_process = pre_process 43 | self.post_process = post_process 44 | init_method = init_method_normal(args.init_method_std) 45 | 46 | self.language_model, self._language_model_key = get_language_model( 47 | num_tokentypes=num_tokentypes, 48 | add_pooler=True, 49 | encoder_attn_mask_type=AttnMaskType.padding, 50 | init_method=init_method, 51 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 52 | args.num_layers), 53 | pre_process=self.pre_process, 54 | post_process=self.post_process) 55 | 56 | # Classification head. 
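        # (Built only on the post_process, i.e. last pipeline, stage. The head is
        # dropout followed by a dense projection of BERT's pooled output --
        # roughly logits = Linear(hidden_size, num_classes)(Dropout(p)(pooled)).)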
57 | if self.post_process: 58 | self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) 59 | self.classification_head = get_linear_layer(args.hidden_size, 60 | self.num_classes, 61 | init_method) 62 | self._classification_head_key = 'classification_head' 63 | 64 | def set_input_tensor(self, input_tensor): 65 | """See megatron.model.transformer.set_input_tensor()""" 66 | self.language_model.set_input_tensor(input_tensor) 67 | 68 | def forward(self, model_input, attention_mask, tokentype_ids=None): 69 | 70 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 71 | input_ids = model_input 72 | position_ids = bert_position_ids(input_ids) 73 | 74 | lm_output = self.language_model( 75 | input_ids, 76 | position_ids, 77 | extended_attention_mask, 78 | tokentype_ids=tokentype_ids 79 | ) 80 | 81 | if self.post_process: 82 | _, pooled_output = lm_output 83 | classification_output = self.classification_dropout(pooled_output) 84 | classification_logits = self.classification_head(classification_output) 85 | 86 | # Reshape back to separate choices. 87 | classification_logits = classification_logits.view(-1, self.num_classes) 88 | 89 | return classification_logits 90 | return lm_output 91 | 92 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 93 | keep_vars=False): 94 | """For easy load when model is combined with other heads, 95 | add an extra key.""" 96 | 97 | state_dict_ = {} 98 | state_dict_[self._language_model_key] \ 99 | = self.language_model.state_dict_for_save_checkpoint( 100 | destination, prefix, keep_vars) 101 | if self.post_process: 102 | state_dict_[self._classification_head_key] \ 103 | = self.classification_head.state_dict( 104 | destination, prefix, keep_vars) 105 | return state_dict_ 106 | 107 | def load_state_dict(self, state_dict, strict=True): 108 | """Customized load.""" 109 | 110 | self.language_model.load_state_dict( 111 | state_dict[self._language_model_key], strict=strict) 112 | if self.post_process: 113 | if self._classification_head_key in state_dict: 114 | self.classification_head.load_state_dict( 115 | state_dict[self._classification_head_key], strict=strict) 116 | else: 117 | print_rank_last('***WARNING*** could not find {} in the checkpoint, ' 118 | 'initializing to random'.format( 119 | self._classification_head_key)) 120 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
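# These enums thread through the rest of megatron/model: ModelType separates
# single-stack networks such as BERT and GPT (encoder_or_decoder) from
# encoder-decoder ones such as T5, while AttnMaskType selects the mask flavor --
# GPTModel passes AttnMaskType.causal for autoregressive attention and the
# BERT-based models pass AttnMaskType.padding for bidirectional attention.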
15 | 16 | import enum 17 | 18 | class ModelType(enum.Enum): 19 | encoder_or_decoder = 1 20 | encoder_and_decoder = 2 21 | 22 | class LayerType(enum.Enum): 23 | encoder = 1 24 | decoder = 2 25 | 26 | class AttnType(enum.Enum): 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | class AttnMaskType(enum.Enum): 31 | padding = 1 32 | causal = 2 33 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | 19 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 20 | # 1/sqrt(2*pi)-> 0.3989423 21 | # 1/sqrt(2) -> 0.70710678 22 | # sqrt(2/pi) -> 0.79788456 23 | # this function is tanh approximation of gelu 24 | # actual gelu is: 25 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 26 | 27 | @torch.jit.script 28 | def bias_gelu(bias, y): 29 | x = bias + y 30 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 31 | 32 | # gradient of tanh approximation of gelu 33 | # gradient of actual gelu is: 34 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 35 | @torch.jit.script 36 | def bias_gelu_back(g, bias, y): 37 | x = bias + y 38 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 39 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 40 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 41 | return ff*g 42 | 43 | class GeLUFunction(torch.autograd.Function): 44 | @staticmethod 45 | # bias is an optional argument 46 | def forward(ctx, input, bias): 47 | ctx.save_for_backward(input, bias) 48 | return bias_gelu(bias, input) 49 | 50 | @staticmethod 51 | def backward(ctx, grad_output): 52 | input, bias = ctx.saved_tensors 53 | tmp = bias_gelu_back(grad_output, bias, input) 54 | return tmp, tmp 55 | 56 | bias_gelu_impl = GeLUFunction.apply 57 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """This code is copied fron NVIDIA apex: 17 | https://github.com/NVIDIA/apex 18 | with some changes. """ 19 | 20 | import numbers 21 | import torch 22 | from torch.nn.parameter import Parameter 23 | from torch.nn import init 24 | import importlib 25 | 26 | global fused_mix_prec_layer_norm_cuda 27 | fused_mix_prec_layer_norm_cuda = None 28 | 29 | 30 | class FusedLayerNormAffineFunction(torch.autograd.Function): 31 | 32 | @staticmethod 33 | def forward(ctx, input, weight, bias, normalized_shape, eps): 34 | 35 | ctx.normalized_shape = normalized_shape 36 | ctx.eps = eps 37 | input_ = input.contiguous() 38 | weight_ = weight.contiguous() 39 | bias_ = bias.contiguous() 40 | output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine( 41 | input_, ctx.normalized_shape, weight_, bias_, ctx.eps) 42 | ctx.save_for_backward(input_, weight_, bias_, mean, invvar) 43 | 44 | return output 45 | 46 | 47 | @staticmethod 48 | def backward(ctx, grad_output): 49 | 50 | input_, weight_, bias_, mean, invvar = ctx.saved_tensors 51 | grad_input = grad_weight = grad_bias = None 52 | grad_input, grad_weight, grad_bias \ 53 | = fused_mix_prec_layer_norm_cuda.backward_affine( 54 | grad_output.contiguous(), mean, invvar, 55 | input_, ctx.normalized_shape, 56 | weight_, bias_, ctx.eps) 57 | 58 | return grad_input, grad_weight, grad_bias, None, None 59 | 60 | 61 | 62 | class MixedFusedLayerNorm(torch.nn.Module): 63 | 64 | def __init__(self, normalized_shape, eps=1e-5): 65 | super(MixedFusedLayerNorm, self).__init__() 66 | 67 | global fused_mix_prec_layer_norm_cuda 68 | #fused_mix_prec_layer_norm_cuda = importlib.import_module( 69 | # "fused_mix_prec_layer_norm_cuda") 70 | fused_mix_prec_layer_norm_cuda = importlib.import_module( 71 | "fused_layer_norm_cuda") 72 | 73 | if isinstance(normalized_shape, numbers.Integral): 74 | normalized_shape = (normalized_shape,) 75 | self.normalized_shape = torch.Size(normalized_shape) 76 | self.eps = eps 77 | self.weight = Parameter(torch.Tensor(*normalized_shape)) 78 | self.bias = Parameter(torch.Tensor(*normalized_shape)) 79 | self.reset_parameters() 80 | 81 | 82 | def reset_parameters(self): 83 | 84 | init.ones_(self.weight) 85 | init.zeros_(self.bias) 86 | 87 | 88 | def forward(self, input): 89 | 90 | return FusedLayerNormAffineFunction.apply( 91 | input, self.weight, self.bias, self.normalized_shape,self.eps) 92 | 93 | -------------------------------------------------------------------------------- /megatron/model/gpt_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """GPT-2 model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import mpu 22 | from .module import MegatronModule 23 | 24 | from .enums import AttnMaskType 25 | from .language_model import parallel_lm_logits 26 | from .language_model import get_language_model 27 | from .utils import init_method_normal 28 | from .utils import scaled_init_method_normal 29 | 30 | 31 | def post_language_model_processing(lm_output, labels, logit_weights, 32 | parallel_output, 33 | fp16_lm_cross_entropy): 34 | 35 | # Output. 36 | output = parallel_lm_logits( 37 | lm_output, 38 | logit_weights, 39 | parallel_output) 40 | 41 | if labels is None: 42 | return output 43 | else: 44 | if fp16_lm_cross_entropy: 45 | assert output.dtype == torch.half 46 | loss = mpu.vocab_parallel_cross_entropy(output, labels) 47 | else: 48 | loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) 49 | return loss 50 | 51 | 52 | class GPTModel(MegatronModule): 53 | """GPT-2 Language model.""" 54 | 55 | def __init__(self, 56 | num_tokentypes=0, 57 | parallel_output=True, 58 | pre_process=True, 59 | post_process=True): 60 | super(GPTModel, self).__init__() 61 | args = get_args() 62 | 63 | self.parallel_output = parallel_output 64 | self.pre_process = pre_process 65 | self.post_process = post_process 66 | self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy 67 | 68 | self.language_model, self._language_model_key = get_language_model( 69 | num_tokentypes=num_tokentypes, 70 | add_pooler=False, 71 | encoder_attn_mask_type=AttnMaskType.causal, 72 | init_method=init_method_normal(args.init_method_std), 73 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 74 | args.num_layers), 75 | pre_process=self.pre_process, 76 | post_process=self.post_process) 77 | 78 | self.initialize_word_embeddings(init_method_normal) 79 | 80 | def set_input_tensor(self, input_tensor): 81 | """See megatron.model.transformer.set_input_tensor()""" 82 | self.language_model.set_input_tensor(input_tensor) 83 | 84 | def forward(self, input_ids, position_ids, attention_mask, labels=None, 85 | tokentype_ids=None, 86 | set_inference_key_value_memory=False, 87 | inference_max_sequence_len=None): 88 | 89 | lm_output = self.language_model( 90 | input_ids, 91 | position_ids, 92 | attention_mask, 93 | set_inference_key_value_memory=set_inference_key_value_memory, 94 | inference_max_sequence_len=inference_max_sequence_len) 95 | 96 | if self.post_process: 97 | return post_language_model_processing( 98 | lm_output, labels, 99 | self.word_embeddings_weight(), 100 | self.parallel_output, 101 | self.fp16_lm_cross_entropy) 102 | else: 103 | return lm_output 104 | 105 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 106 | keep_vars=False): 107 | 108 | state_dict_ = {} 109 | state_dict_[self._language_model_key] \ 110 | = self.language_model.state_dict_for_save_checkpoint( 111 | destination, prefix, keep_vars) 112 | # Save word_embeddings. 113 | if self.post_process and not self.pre_process: 114 | state_dict_[self._word_embeddings_for_head_key] \ 115 | = self.word_embeddings.state_dict(destination, prefix, keep_vars) 116 | return state_dict_ 117 | 118 | def load_state_dict(self, state_dict, strict=True): 119 | """Customized load.""" 120 | 121 | # Load word_embeddings. 
122 | if self.post_process and not self.pre_process: 123 | self.word_embeddings.load_state_dict( 124 | state_dict[self._word_embeddings_for_head_key], strict=strict) 125 | if self._language_model_key in state_dict: 126 | state_dict = state_dict[self._language_model_key] 127 | self.language_model.load_state_dict(state_dict, strict=strict) 128 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from megatron import get_args 23 | 24 | def init_method_normal(sigma): 25 | """Init method based on N(0, sigma).""" 26 | def init_(tensor): 27 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 28 | 29 | return init_ 30 | 31 | 32 | def scaled_init_method_normal(sigma, num_layers): 33 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 34 | std = sigma / math.sqrt(2.0 * num_layers) 35 | 36 | def init_(tensor): 37 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 38 | 39 | return init_ 40 | 41 | 42 | def attention_mask_func(attention_scores, attention_mask): 43 | attention_scores.masked_fill_(attention_mask, -10000.0) 44 | return attention_scores 45 | 46 | 47 | def get_linear_layer(rows, columns, init_method): 48 | """Simple linear layer with weight initialization.""" 49 | layer = torch.nn.Linear(rows, columns) 50 | init_method(layer.weight) 51 | with torch.no_grad(): 52 | layer.bias.zero_() 53 | return layer 54 | 55 | @torch.jit.script 56 | def gelu_impl(x): 57 | """OpenAI's gelu implementation.""" 58 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 59 | (1.0 + 0.044715 * x * x))) 60 | def openai_gelu(x): 61 | return gelu_impl(x) 62 | 63 | # This is actually the Python equivalent of torch.nn.functional.gelu(), with type hints for the ONNX exporter 64 | @torch.jit.script 65 | def erf_gelu(x): 66 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 67 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import is_rank_in_embedding_group 35 | from .initialize import is_pipeline_stage_before_split, is_pipeline_stage_after_split 36 | from .initialize import is_pipeline_stage_at_split 37 | from .initialize import get_num_layers 38 | from .initialize import get_tensor_model_parallel_src_rank 39 | from .initialize import get_pipeline_model_parallel_first_rank 40 | from .initialize import get_pipeline_model_parallel_last_rank 41 | from .initialize import get_pipeline_model_parallel_next_rank 42 | from .initialize import get_pipeline_model_parallel_prev_rank 43 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 44 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 45 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 46 | from .initialize import get_next_rank_group, get_prev_rank_group 47 | from .initialize import initialize_model_parallel 48 | from .initialize import model_parallel_is_initialized 49 | 50 | from .layers import ColumnParallelLinear 51 | from .layers import RowParallelLinear 52 | from .layers import VocabParallelEmbedding 53 | from .layers import (set_tensor_model_parallel_attributes, 54 | set_defaults_if_not_set_tensor_model_parallel_attributes, 55 | copy_tensor_model_parallel_attributes) 56 | 57 | from .mappings import copy_to_tensor_model_parallel_region 58 | from .mappings import gather_from_tensor_model_parallel_region 59 | from .mappings import reduce_from_tensor_model_parallel_region 60 | from .mappings import scatter_to_tensor_model_parallel_region 61 | 62 | from .random import checkpoint 63 | from .random import get_cuda_rng_tracker 64 | from .random import model_parallel_cuda_manual_seed 65 | from .random import gather_split_1d_tensor 66 | from .random import split_tensor_into_1d_equal_chunks 67 | 68 | from .utils import divide 69 | from .utils import split_tensor_along_last_dim 70 | -------------------------------------------------------------------------------- /megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_tensor_model_parallel_group 19 | from .initialize import get_tensor_model_parallel_rank 20 | from .initialize import get_tensor_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_tensor_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), 51 | group=get_tensor_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data dictionary to be broadcast 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_tensor_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 
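        # (Packing everything into one flat tensor means a single broadcast per
        # step instead of one collective per key; e.g. {'text': LongTensor(4, 512)}
        # travels as one 2048-element tensor and is reshaped on arrival using the
        # size dictionary built above.)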
94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Broadcast 104 | torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(), 105 | group=get_tensor_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved. 16 | 17 | import torch 18 | 19 | from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank 20 | from .utils import split_tensor_along_last_dim 21 | import torch_xla.core.xla_model as xm 22 | 23 | 24 | def _reduce(input_): 25 | """All-reduce the input tensor across model parallel group.""" 26 | 27 | # Bypass the function if we are using only 1 GPU. 28 | if get_tensor_model_parallel_world_size()==1: 29 | return input_ 30 | 31 | # All-reduce. 32 | torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group(), async_op=True) 33 | 34 | return input_ 35 | 36 | 37 | def _split(input_): 38 | """Split the tensor along its last dimension and keep the 39 | corresponding slice.""" 40 | 41 | world_size = get_tensor_model_parallel_world_size() 42 | # Bypass the function if we are using only 1 GPU. 43 | if world_size==1: 44 | return input_ 45 | 46 | # Split along last dimension. 47 | input_list = split_tensor_along_last_dim(input_, world_size) 48 | 49 | # Note: torch.split does not create contiguous tensors by default. 50 | rank = get_tensor_model_parallel_rank() 51 | output = input_list[rank].contiguous() 52 | 53 | return output 54 | 55 | 56 | def _gather(input_): 57 | """Gather tensors and concatenate along the last dimension.""" 58 | 59 | world_size = get_tensor_model_parallel_world_size() 60 | # Bypass the function if we are using only 1 GPU. 61 | if world_size==1: 62 | return input_ 63 | 64 | # Size and dimension. 
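    # (Shape sketch: with a tensor-parallel world size of 2 and a per-rank input
    # of [b, s, h/2], all_gather below collects both shards and the concatenation
    # restores the full [b, s, h] activation.)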
65 | last_dim = input_.dim() - 1 66 | rank = get_tensor_model_parallel_rank() 67 | 68 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 69 | tensor_list[rank] = input_ 70 | torch.distributed.all_gather(tensor_list, input_, 71 | group=get_tensor_model_parallel_group(), 72 | async_op=True) 73 | 74 | # Note: torch.cat already creates a contiguous tensor. 75 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 76 | 77 | return output 78 | 79 | 80 | class _CopyToModelParallelRegion(torch.autograd.Function): 81 | """Pass the input to the model parallel region.""" 82 | 83 | @staticmethod 84 | def symbolic(graph, input_): 85 | return input_ 86 | 87 | @staticmethod 88 | def forward(ctx, input_): 89 | return input_ 90 | 91 | @staticmethod 92 | def backward(ctx, grad_output): 93 | return _reduce(grad_output) 94 | 95 | 96 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 97 | """All-reduce the input from the model parallel region.""" 98 | 99 | @staticmethod 100 | def symbolic(graph, input_): 101 | return _reduce(input_) 102 | 103 | @staticmethod 104 | def forward(ctx, input_): 105 | return _reduce(input_) 106 | 107 | @staticmethod 108 | def backward(ctx, grad_output): 109 | return grad_output 110 | 111 | 112 | class _ScatterToModelParallelRegion(torch.autograd.Function): 113 | """Split the input and keep only the chunk corresponding to the rank.""" 114 | 115 | @staticmethod 116 | def symbolic(graph, input_): 117 | return _split(input_) 118 | 119 | @staticmethod 120 | def forward(ctx, input_): 121 | return _split(input_) 122 | 123 | @staticmethod 124 | def backward(ctx, grad_output): 125 | return _gather(grad_output) 126 | 127 | 128 | class _GatherFromModelParallelRegion(torch.autograd.Function): 129 | """Gather the input from model parallel region and concatenate.""" 130 | 131 | @staticmethod 132 | def symbolic(graph, input_): 133 | return _gather(input_) 134 | 135 | @staticmethod 136 | def forward(ctx, input_): 137 | return _gather(input_) 138 | 139 | @staticmethod 140 | def backward(ctx, grad_output): 141 | return _split(grad_output) 142 | 143 | 144 | # ----------------- 145 | # Helper functions. 146 | # ----------------- 147 | 148 | def copy_to_tensor_model_parallel_region(input_): 149 | return _CopyToModelParallelRegion.apply(input_) 150 | 151 | 152 | def reduce_from_tensor_model_parallel_region(input_): 153 | return _ReduceFromModelParallelRegion.apply(input_) 154 | 155 | 156 | def scatter_to_tensor_model_parallel_region(input_): 157 | return _ScatterToModelParallelRegion.apply(input_) 158 | 159 | 160 | def gather_from_tensor_model_parallel_region(input_): 161 | return _GatherFromModelParallelRegion.apply(input_) 162 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-neuron/aws-neuron-reference-for-megatron-lm/868d46cccb320a05eeac833be4a55c2c07db620a/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | from megatron import mpu 23 | import torch_xla.distributed.xla_backend  # for XLA backend 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducibility.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % torch.cuda.device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | torch.cuda.set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # Modifications copyright Amazon Web Services and its affiliates. All rights reserved.
16 | 
17 | import sys
18 | sys.path.append("../..")  # must run before the mpu imports below
19 | from commons import set_random_seed
20 | from commons import IdentityLayer
21 | from commons import print_separator
22 | from commons import initialize_distributed
23 | from mpu.cross_entropy import vocab_parallel_cross_entropy
24 | import mpu
25 | import torch.nn.functional as F
26 | import torch
27 | import random
28 | 
29 | 
30 | def torch_cross_entropy(batch_size, seq_length, vocab_size,
31 |                         logits_scale, seed):
32 |     set_random_seed(seed)
33 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
34 |                              scale=logits_scale).cuda()
35 |     logits = identity()
36 |     target = torch.cuda.LongTensor(
37 |         size=(batch_size, seq_length)).random_(0, vocab_size)
38 |     loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
39 |                            target.view(-1),
40 |                            reduction='none').view_as(target).mean()
41 |     loss.backward()
42 |     return loss, identity.weight.grad
43 | 
44 | 
45 | def mpu_cross_entropy(batch_size, seq_length, vocab_size,
46 |                       logits_scale, seed):
47 |     set_random_seed(seed)
48 |     identity = IdentityLayer((batch_size, seq_length, vocab_size),
49 |                              scale=logits_scale).cuda()
50 |     logits = identity()
51 |     logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
52 |     target = torch.cuda.LongTensor(
53 |         size=(batch_size, seq_length)).random_(0, vocab_size)
54 |     loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
55 |     loss.backward()
56 |     return loss, identity.weight.grad
57 | 
58 | 
59 | def test_cross_entropy(tensor_model_parallel_size):
60 | 
61 |     if torch.distributed.get_rank() == 0:
62 |         print('> testing cross entropy with model parallel size {} ...'.
63 |               format(tensor_model_parallel_size))
64 | 
65 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
66 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
67 | 
68 |     batch_size = 13
69 |     seq_length = 17
70 |     vocab_size_per_partition = 11
71 |     logits_scale = 1000.0
72 |     vocab_size = vocab_size_per_partition * tensor_model_parallel_size
73 |     seed = 1234
74 | 
75 |     loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
76 |                                                  vocab_size, logits_scale,
77 |                                                  seed)
78 |     loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
79 |                                            vocab_size, logits_scale,
80 |                                            seed)
81 | 
82 |     error = loss_torch.sub_(loss_mpu).abs().max()
83 |     print('   max error in loss on global rank {}: {}'.format(
84 |         torch.distributed.get_rank(), error))
85 |     assert error < 1.0e-6
86 | 
87 |     error = grad_torch.sub_(grad_mpu).abs().max()
88 |     print('   max error in grad on global rank {}: {}'.format(
89 |         torch.distributed.get_rank(), error))
90 |     assert error < 1.0e-6
91 | 
92 |     # Reset groups
93 |     mpu.destroy_tensor_model_parallel()
94 | 
95 |     torch.distributed.barrier()
96 |     if torch.distributed.get_rank() == 0:
97 |         print('>> passed the test :-)')
98 | 
99 | 
100 | if __name__ == '__main__':
101 | 
102 |     initialize_distributed()
103 |     world_size = torch.distributed.get_world_size()
104 | 
105 |     tensor_model_parallel_size = 1
106 |     while tensor_model_parallel_size <= world_size:
107 |         print_separator('test cross entropy')
108 |         test_cross_entropy(tensor_model_parallel_size)
109 |         tensor_model_parallel_size *= 2
-------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import sys
17 | sys.path.append("../..")  # must run before the mpu imports below
18 | from commons import print_separator
19 | from commons import initialize_distributed
20 | from mpu import data as data_utils
21 | import mpu
22 | import torch
23 | import functools
24 | import operator
25 | 
26 | 
27 | def test_broadcast_data(tensor_model_parallel_size):
28 | 
29 |     if torch.distributed.get_rank() == 0:
30 |         print('> testing broadcast_data with model parallel size {} ...'.
31 |               format(tensor_model_parallel_size))
32 | 
33 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
34 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
35 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
36 | 
37 |     key_size_t = {'key1': [7, 11],
38 |                   'key2': [8, 2, 1],
39 |                   'key3': [13],
40 |                   'key4': [5, 1, 2],
41 |                   'key5': [5, 12]}
42 |     keys = list(key_size_t.keys())
43 | 
44 |     data = {}
45 |     data_t = {}
46 |     for key in key_size_t:
47 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
48 |         data_t[key] = data[key].clone()
49 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
50 |     data_t['keyX'] = data['keyX'].clone()
51 |     if mpu.get_tensor_model_parallel_rank() != 0:
52 |         data = None
53 | 
54 |     data_utils._check_data_types(keys, data_t, torch.int64)
55 |     key_size, key_numel, \
56 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
57 |     for key in keys:
58 |         assert key_size[key] == key_size_t[key]
59 |     total_numel_t = 0
60 |     for key in keys:
61 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
62 |         assert key_numel[key] == target_size
63 |         total_numel_t += target_size
64 |     assert total_numel == total_numel_t
65 | 
66 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
67 |     for key in keys:
68 |         tensor = data_t[key].cuda()
69 |         assert data_b[key].sub(tensor).abs().max() == 0
70 | 
71 |     # Reset groups
72 |     mpu.destroy_tensor_model_parallel()
73 | 
74 |     torch.distributed.barrier()
75 |     if torch.distributed.get_rank() == 0:
76 |         print('>> passed the test :-)')
77 | 
78 | 
79 | if __name__ == '__main__':
80 | 
81 |     initialize_distributed()
82 |     world_size = torch.distributed.get_world_size()
83 | 
84 |     tensor_model_parallel_size = 1
85 |     while tensor_model_parallel_size <= world_size:
86 |         print_separator('test broadcast data')
87 |         test_broadcast_data(tensor_model_parallel_size)
88 |         tensor_model_parallel_size *= 2
-------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import sys
17 | sys.path.append("../..")  # must run before the mpu import below
18 | from commons import print_separator
19 | from commons import initialize_distributed
20 | import mpu
21 | import torch
22 | 
23 | 
24 | def test_initialize_model_parallel(tensor_model_parallel_size):
25 | 
26 |     if torch.distributed.get_rank() == 0:
27 |         print('> testing initialize_model_parallel with size {} ...'.format(
28 |             tensor_model_parallel_size))
29 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
30 |                                       torch.distributed.get_world_size())
31 |     assert not mpu.model_parallel_is_initialized()
32 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
33 |     assert mpu.model_parallel_is_initialized()
34 | 
35 |     # Checks.
36 |     def check(group, world_size, rank):
37 |         assert world_size == torch.distributed.get_world_size(group=group)
38 |         assert rank == torch.distributed.get_rank(group=group)
39 | 
40 |     # Model parallel.
41 |     world_size = tensor_model_parallel_size_
42 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
43 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
44 |     assert rank == mpu.get_tensor_model_parallel_rank()
45 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
46 | 
47 |     # Data parallel.
48 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
49 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size_
50 |     assert world_size == mpu.get_data_parallel_world_size()
51 |     assert rank == mpu.get_data_parallel_rank()
52 |     check(mpu.get_data_parallel_group(), world_size, rank)
53 | 
54 |     # Reset groups
55 |     mpu.destroy_model_parallel()
56 | 
57 |     torch.distributed.barrier()
58 |     if torch.distributed.get_rank() == 0:
59 |         print('>> passed the test :-)')
60 | 
61 | 
62 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
63 | 
64 |     if torch.distributed.get_rank() == 0:
65 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
66 |             tensor_model_parallel_size_))
67 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
68 |                                      torch.distributed.get_world_size())
69 |     assert not mpu.model_parallel_is_initialized()
70 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
71 |     assert mpu.model_parallel_is_initialized()
72 | 
73 |     # Checks
74 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
75 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
76 | 
77 |     # Reset groups
78 |     mpu.destroy_model_parallel()
79 | 
80 |     torch.distributed.barrier()
81 |     if torch.distributed.get_rank() == 0:
82 |         print('>> passed the test :-)')
83 | 
84 | 
85 | if __name__ == '__main__':
86 | 
87 |     initialize_distributed()
88 |     world_size = torch.distributed.get_world_size()
89 |     tensor_model_parallel_size = 1
90 |     while tensor_model_parallel_size <= world_size:
91 |         print_separator('test initialize model parallel')
92 |         test_initialize_model_parallel(tensor_model_parallel_size)
93 |         print_separator('test model parallel source rank')
94 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
95 |         tensor_model_parallel_size *= 2
96 | 
-------------------------------------------------------------------------------- /megatron/mpu/utils.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are half-open: [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
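A quick worked example of the partition arithmetic above (a sketch; it assumes the repository root is on PYTHONPATH):

# Splitting a 50,304-token vocabulary across 4 tensor-parallel ranks
# yields half-open [first, last) ranges of 12,576 ids each.
from megatron.mpu.utils import VocabUtility

for rank in range(4):
    first, last = VocabUtility.vocab_range_from_global_vocab_size(50304, rank, 4)
    print(rank, first, last)
# rank 0 -> [0, 12576), rank 1 -> [12576, 25152),
# rank 2 -> [25152, 37728), rank 3 -> [37728, 50304)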
-------------------------------------------------------------------------------- /megatron/optimizer/__init__.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | #from apex.optimizers import FusedAdam as Adam
17 | #from apex.optimizers import FusedSGD as SGD
18 | from torch.optim import AdamW as Adam
19 | from torch.optim import SGD
20 | 
21 | from megatron import get_args
22 | from megatron.model import LayerNorm
23 | 
24 | from .grad_scaler import ConstantGradScaler, DynamicGradScaler
25 | from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
26 | 
27 | 
28 | def _get_params_for_weight_decay_optimization(modules):
29 |     """Divide params into with-weight-decay and without-weight-decay groups.
30 |     LayerNorms and biases will have no weight decay but the rest will.
31 |     """
32 | 
33 |     weight_decay_params = {'params': []}
34 |     no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
35 |     for module in modules:
36 |         for module_ in module.modules():
37 |             if isinstance(module_, LayerNorm):
38 |                 no_weight_decay_params['params'].extend(
39 |                     [p for p in list(module_._parameters.values())
40 |                      if p is not None])
41 |             else:
42 |                 weight_decay_params['params'].extend(
43 |                     [p for n, p in list(module_._parameters.items())
44 |                      if p is not None and n != 'bias'])
45 |                 no_weight_decay_params['params'].extend(
46 |                     [p for n, p in list(module_._parameters.items())
47 |                      if p is not None and n == 'bias'])
48 | 
49 |     return weight_decay_params, no_weight_decay_params
50 | 
51 | 
52 | def get_megatron_optimizer(model):
53 |     args = get_args()
54 | 
55 |     # Base optimizer.
56 |     param_groups = _get_params_for_weight_decay_optimization(model)
57 |     if args.optimizer == 'adam':
58 |         optimizer = Adam(param_groups,
59 |                          lr=args.lr,
60 |                          weight_decay=args.weight_decay,
61 |                          betas=(args.adam_beta1, args.adam_beta2),
62 |                          eps=args.adam_eps)
63 |     elif args.optimizer == 'sgd':
64 |         optimizer = SGD(param_groups,
65 |                         lr=args.lr,
66 |                         weight_decay=args.weight_decay,
67 |                         momentum=args.sgd_momentum)
68 |     else:
69 |         raise Exception('{} optimizer is not supported.'.format(
70 |             args.optimizer))
71 | 
72 |     # Determine whether the params have main-grad field.
73 |     params_have_main_grad = False
74 |     if args.DDP_impl == 'local':
75 |         params_have_main_grad = True
76 | 
77 |     if args.fp16 or args.bf16:
78 | 
79 |         # Grad scaler:
80 |         #    if a loss-scale is provided, instantiate the constant scaler.
81 |         #    if we are using fp16 and a loss-scale is not present, use a
82 |         #       dynamic scaler.
83 |         #    otherwise we are running in bf16 with no loss-scale, so
84 |         #       leave the grad scaler as None.
85 |         grad_scaler = None
86 |         # Constant loss scale.
87 |         if args.loss_scale:
88 |             grad_scaler = ConstantGradScaler(args.loss_scale)
89 |         # Dynamic loss scale.
90 |         else:
91 |             if args.fp16:
92 |                 grad_scaler = DynamicGradScaler(
93 |                     initial_scale=args.initial_loss_scale,
94 |                     min_scale=args.min_loss_scale,
95 |                     growth_factor=2.0,
96 |                     backoff_factor=0.5,
97 |                     growth_interval=args.loss_scale_window,
98 |                     hysteresis=args.hysteresis)
99 | 
100 |         # Megatron optimizer.
101 |         return Float16OptimizerWithFloat16Params(optimizer,
102 |                                                  args.clip_grad,
103 |                                                  args.log_num_zeros_in_grad,
104 |                                                  params_have_main_grad,
105 |                                                  args.use_contiguous_buffers_in_local_ddp,
106 |                                                  args.bf16,
107 |                                                  grad_scaler)
108 | 
109 |     # FP32.
110 |     return FP32Optimizer(optimizer, args.clip_grad,
111 |                          args.log_num_zeros_in_grad,
112 |                          params_have_main_grad,
113 |                          args.use_contiguous_buffers_in_local_ddp)
114 | 
-------------------------------------------------------------------------------- /megatron/optimizer/grad_scaler.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Megatron grad scaler."""
17 | 
18 | from abc import ABC
19 | from abc import abstractmethod
20 | 
21 | import torch
22 | 
23 | import torch_xla.core.xla_model as xm
24 | torch.cuda.FloatTensor = lambda t: torch.FloatTensor(t).to(xm.xla_device())  # Neuron shim: build on the XLA device
25 | torch.cuda.IntTensor = lambda t: torch.IntTensor(t).to(xm.xla_device())  # Neuron shim: build on the XLA device
26 | 
27 | class MegatronGradScaler(ABC):
28 | 
29 |     def __init__(self, initial_scale):
30 |         """Initialize scale value with the input initial scale."""
31 |         assert initial_scale > 0.0
32 |         self._scale = torch.cuda.FloatTensor([initial_scale])
33 | 
34 |     @property
35 |     def scale(self):
36 |         return self._scale
37 | 
38 |     @property
39 |     def inv_scale(self):
40 |         return self._scale.double().reciprocal().float()
41 | 
42 |     @abstractmethod
43 |     def update(self, found_inf):
44 |         pass
45 | 
46 |     @abstractmethod
47 |     def state_dict(self):
48 |         pass
49 | 
50 |     @abstractmethod
51 |     def load_state_dict(self, state_dict):
52 |         pass
53 | 
54 | 
55 | 
56 | class ConstantGradScaler(MegatronGradScaler):
57 | 
58 |     def update(self, found_inf):
59 |         pass
60 | 
61 |     def state_dict(self):
62 |         return dict()
63 | 
64 |     def load_state_dict(self, state_dict):
65 |         pass
66 | 
67 | 
68 | 
69 | class DynamicGradScaler(MegatronGradScaler):
70 | 
71 |     def __init__(self, initial_scale, min_scale,
72 |                  growth_factor, backoff_factor,
73 |                  growth_interval, hysteresis):
74 |         """Grad scaler with dynamic scale that gets adjusted
75 |         during training."""
76 |         super(DynamicGradScaler, self).__init__(initial_scale)
77 | 
78 |         # Lower bound on the scale.
79 |         assert min_scale > 0.0
80 |         assert min_scale <= initial_scale
81 |         self.min_scale = torch.cuda.FloatTensor([min_scale])
82 |         # Growth and backoff factors for the scale.
83 |         assert growth_factor > 1.0
84 |         self.growth_factor = torch.cuda.FloatTensor([growth_factor])
85 |         assert backoff_factor < 1.0
86 |         assert backoff_factor > 0.0
87 |         self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
88 |         # Interval over which if we don't see any inf/nan,
89 |         # we will scale the grad scale by the growth factor.
90 |         assert growth_interval > 0
91 |         self.growth_interval = growth_interval
92 |         # Number of inf/nans we should see before scaling down
93 |         # the grad scale by the backoff factor.
94 |         assert hysteresis > 0
95 |         self.hysteresis = hysteresis
96 | 
97 |         # Trackers.
98 |         self._growth_tracker = 0
99 |         self._hysteresis_tracker = self.hysteresis
100 | 
101 | 
102 |     def update(self, found_inf):
103 | 
104 |         # If we have an inf/nan, the growth tracker is set to 0
105 |         # and the hysteresis tracker is reduced by 1.
106 |         if found_inf:
107 |             self._growth_tracker = 0
108 |             self._hysteresis_tracker -= 1
109 |             # Now if we are out of hysteresis count, scale down the loss.
110 |             if self._hysteresis_tracker <= 0:
111 |                 self._scale = torch.max(self._scale * self.backoff_factor,
112 |                                         self.min_scale)
113 |         else:
114 |             # If there is no nan/inf, increment the growth tracker.
115 |             self._growth_tracker += 1
116 |             # If we have had enough consecutive intervals with no nan/inf:
117 |             if self._growth_tracker == self.growth_interval:
118 |                 # Reset the growth and hysteresis trackers,
119 |                 self._growth_tracker = 0
120 |                 self._hysteresis_tracker = self.hysteresis
121 |                 # and scale up the loss scale.
122 |                 self._scale = self._scale * self.growth_factor
123 | 
124 | 
125 |     def state_dict(self):
126 |         state_dict = {}
127 |         state_dict['scale'] = self._scale
128 |         state_dict['growth_tracker'] = self._growth_tracker
129 |         state_dict['hysteresis_tracker'] = self._hysteresis_tracker
130 |         return state_dict
131 | 
132 | 
133 |     def load_state_dict(self, state_dict):
134 |         #self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
135 |         self._scale = state_dict['scale'].to(xm.xla_device())
136 |         self._growth_tracker = state_dict['growth_tracker']
137 |         self._hysteresis_tracker = state_dict['hysteresis_tracker']
138 | 
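A small trace of the update rule above with illustrative constructor values (importing this module assumes a torch_xla environment because of the shim at the top of the file):

from megatron.optimizer.grad_scaler import DynamicGradScaler

scaler = DynamicGradScaler(initial_scale=2.**16, min_scale=1.0,
                           growth_factor=2.0, backoff_factor=0.5,
                           growth_interval=3, hysteresis=2)
scaler.update(found_inf=True)    # growth tracker -> 0, hysteresis 2 -> 1
scaler.update(found_inf=True)    # hysteresis hits 0: scale halves to 2.**15
for _ in range(3):               # growth_interval consecutive clean steps
    scaler.update(found_inf=False)
# scale doubles back to 2.**16 and both trackers are reset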
-------------------------------------------------------------------------------- /megatron/text_generation_server.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import datetime
16 | import torch
17 | import json
18 | import threading
19 | from flask import Flask, request, jsonify, current_app
20 | from flask_restful import Resource, Api
21 | from megatron import get_args
22 | from megatron import mpu
23 | from megatron.text_generation_utils import generate
24 | 
25 | GENERATE_NUM = 0
26 | lock = threading.Lock()
27 | 
28 | class MegatronGenerate(Resource):
29 |     def __init__(self, model):
30 |         self.model = model
31 | 
32 |     @staticmethod
33 |     def send_do_generate():
34 |         choice = torch.cuda.LongTensor([GENERATE_NUM])
35 |         torch.distributed.broadcast(choice, 0)
36 | 
37 |     def put(self):
38 |         args = get_args()
39 |         print("request IP: " + str(request.remote_addr))
40 |         print(json.dumps(request.get_json()), flush=True)
41 |         print("current time: ", datetime.datetime.now())
42 | 
43 |         sentences = request.get_json()["sentences"]
44 |         if len(sentences) > 128:
45 |             return "Maximum number of sentences is 128", 400
46 | 
47 |         tokens_to_generate = 64  # Choosing a hopefully sane default; generating the full sequence is slow.
48 |         if "tokens_to_generate" in request.get_json():
49 |             tokens_to_generate = request.get_json()["tokens_to_generate"]
50 |             if not isinstance(tokens_to_generate, int):
51 |                 return "tokens_to_generate must be an integer greater than 0", 400
52 |             if tokens_to_generate < 1:
53 |                 return "tokens_to_generate must be an integer greater than 0", 400
54 | 
55 |         all_probs = False
56 |         if "all_probs" in request.get_json():
57 |             all_probs = request.get_json()["all_probs"]
58 |             if not isinstance(all_probs, bool):
59 |                 return "all_probs must be a boolean value", 400
60 | 
61 |         temperature = args.temperature
62 |         if "temperature" in request.get_json():
63 |             temperature = request.get_json()["temperature"]
64 |             if not (type(temperature) == int or type(temperature) == float):
65 |                 return "temperature must be a positive number less than or equal to 100.0", 400
66 |             if not (0.0 < temperature <= 100.0):
67 |                 return "temperature must be a positive number less than or equal to 100.0", 400
68 | 
69 |         add_BOS = False
70 |         if "add_BOS" in request.get_json():
71 |             add_BOS = request.get_json()["add_BOS"]
72 |             if not isinstance(add_BOS, bool):
73 |                 return "add_BOS must be a boolean value", 400
74 | 
75 |         with lock:  # Acquire the lock so concurrent requests cannot interleave generation.
76 |             MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
77 |             resp_sentences, resp_sentences_seg, output_logits, full_logits, tokens = generate(self.model, sentences, tokens_to_generate, all_probs, temperature, add_BOS)
78 | 
79 |         if all_probs:
80 |             return jsonify({"sentences": resp_sentences,
81 |                             "segments": resp_sentences_seg,
82 |                             "logits": output_logits,
83 |                             "all_logits": full_logits,
84 |                             "tokens": tokens})
85 | 
86 |         return jsonify({"sentences": resp_sentences,
87 |                         "segments": resp_sentences_seg,
88 |                         "logits": output_logits})
89 | 
90 | class MegatronServer(object):
91 |     def __init__(self, model):
92 |         self.app = Flask(__name__, static_url_path='')
93 |         api = Api(self.app)
94 |         api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model])
95 | 
96 |     def run(self, url):
97 |         self.app.run(url, threaded=True, debug=False)
98 | 
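With the server running, the `/generate` endpoint can be exercised from Python. A sketch; the host and port are assumptions (Flask's default port is 5000), not part of this file:

import requests

resp = requests.put("http://localhost:5000/generate",
                    json={"sentences": ["The capital of France is"],
                          "tokens_to_generate": 32})
print(resp.json()["sentences"])  # prompts echoed with generated continuations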
-------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | from .tokenizer import build_tokenizer
18 | 
-------------------------------------------------------------------------------- /pretrain_gpt.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain GPT"""
17 | 
18 | import torch
19 | from functools import partial
20 | from megatron import get_args
21 | from megatron import print_rank_0
22 | from megatron import get_timers
23 | from megatron import get_tokenizer
24 | from megatron import mpu
25 | from megatron.data.gpt_dataset import build_train_valid_test_datasets
26 | from megatron.model import GPTModel, ModelType
27 | from megatron.training import pretrain
28 | from megatron.utils import get_ltor_masks_and_position_ids
29 | from megatron.utils import average_losses_across_data_parallel_group
30 | import torch_xla.core.xla_model as xm
31 | import os
32 | 
33 | os.environ["NEURON_CC_FLAGS"] = os.environ.get('NEURON_CC_FLAGS', '') + " --model-type transformer"
34 | 
35 | def model_provider(pre_process=True, post_process=True):
36 |     """Build the model."""
37 | 
38 |     device = xm.xla_device()
39 |     print_rank_0('building GPT model ...')
40 |     model = GPTModel(
41 |         num_tokentypes=0,
42 |         parallel_output=True,
43 |         pre_process=pre_process,
44 |         post_process=post_process
45 |     ).to(device)
46 |     return model
47 | 
48 | 
49 | def get_batch(data_iterator):
50 |     """Generate a batch"""
51 |     args = get_args()
52 |     tokenizer = get_tokenizer()
53 | 
54 |     # Items and their type.
55 |     keys = ['text']
56 |     datatype = torch.int64
57 | 
58 |     # Broadcast data.
59 |     if data_iterator is not None:
60 |         data = next(data_iterator)
61 |     else:
62 |         data = None
63 |     #data_b = mpu.broadcast_data(keys, data, datatype)
64 |     data_b = data
65 | 
66 |     # Unpack.
67 |     #tokens_ = data_b['text'].long()
68 |     tokens_ = data_b['text'].int()
69 |     labels = tokens_[:, 1:].contiguous()
70 |     tokens = tokens_[:, :-1].contiguous()
71 | 
72 |     # Get the masks and position ids.
73 |     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
74 |         tokens,
75 |         tokenizer.eod,
76 |         args.reset_position_ids,
77 |         args.reset_attention_mask,
78 |         args.eod_mask_loss)
79 | 
80 |     return tokens, labels, loss_mask, attention_mask, position_ids
81 | 
82 | def loss_func(loss_mask, output_tensor):
83 |     losses = output_tensor.float()
84 |     loss_mask = loss_mask.view(-1).float()
85 |     loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
86 | 
87 |     if mpu.get_data_parallel_world_size() > 1:
88 |         # Reduce loss for logging.
89 |         averaged_loss = average_losses_across_data_parallel_group([loss])
90 |         return loss, {'lm loss': averaged_loss[0].detach()}
91 |     else:
92 |         return loss, {'lm loss': loss.detach()}
93 | 
94 | 
95 | def forward_step(data_iterator, model):
96 |     """Forward step."""
97 |     args = get_args()
98 |     timers = get_timers()
99 | 
100 |     # Get the batch.
101 |     # The timers('batch-generator') start/stop calls are commented out to enable the evaluation step.
102 |     # timers('batch-generator').start()
103 |     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
104 |         data_iterator)
105 |     # timers('batch-generator').stop()
106 |     output_tensor = model(tokens, position_ids, attention_mask,
107 |                           labels=labels)
108 | 
109 |     return output_tensor, partial(loss_func, loss_mask)
110 | 
111 | 
112 | def train_valid_test_datasets_provider(train_val_test_num_samples):
113 |     """Build train, valid, and test datasets."""
114 |     args = get_args()
115 | 
116 |     print_rank_0('> building train, validation, and test datasets '
117 |                  'for GPT ...')
118 |     train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
119 |         data_prefix=args.data_path,
120 |         data_impl=args.data_impl,
121 |         splits_string=args.split,
122 |         train_valid_test_num_samples=train_val_test_num_samples,
123 |         seq_length=args.seq_length,
124 |         seed=args.seed,
125 |         skip_warmup=(not args.mmap_warmup))
126 |     print_rank_0("> finished creating GPT datasets ...")
127 |     return train_ds, valid_ds, test_ds
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     pretrain(train_valid_test_datasets_provider, model_provider,
132 |              ModelType.encoder_or_decoder,
133 |              forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
134 |     xm.rendezvous('ending')
135 | 
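The one-token shift performed in `get_batch` above is easy to see on a toy tensor (standalone sketch):

import torch

tokens_ = torch.tensor([[11, 12, 13, 14]])  # one sample of seq_length + 1 ids
labels = tokens_[:, 1:].contiguous()   # tensor([[12, 13, 14]]): next-token targets
tokens = tokens_[:, :-1].contiguous()  # tensor([[11, 12, 13]]): model inputs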
-------------------------------------------------------------------------------- /pretrain_gpt_mp.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain GPT"""
17 | 
18 | import torch
19 | from functools import partial
20 | from megatron import get_args
21 | from megatron import print_rank_0
22 | from megatron import get_timers
23 | from megatron import get_tokenizer
24 | from megatron import mpu
25 | from megatron.data.gpt_dataset import build_train_valid_test_datasets
26 | from megatron.model import GPTModel, ModelType
27 | from megatron.training import pretrain
28 | from megatron.utils import get_ltor_masks_and_position_ids
29 | from megatron.utils import average_losses_across_data_parallel_group
30 | import torch_xla.core.xla_model as xm
31 | import torch_xla.distributed.xla_multiprocessing as xmp
32 | import os
33 | import torch_xla.debug.metrics as met
34 | 
35 | import torch_xla.debug.profiler as xp
36 | 
37 | os.environ["NEURON_CC_FLAGS"] = "--model-type transformer"
38 | 
39 | def model_provider(pre_process=True, post_process=True):
40 |     """Build the model."""
41 | 
42 |     device = xm.xla_device()
43 |     print_rank_0('building GPT model ...')
44 |     model = GPTModel(
45 |         num_tokentypes=0,
46 |         parallel_output=True,
47 |         pre_process=pre_process,
48 |         post_process=post_process
49 |     ).to(device)
50 |     return model
51 | 
52 | 
53 | def get_batch(data_iterator):
54 |     """Generate a batch"""
55 |     args = get_args()
56 |     tokenizer = get_tokenizer()
57 | 
58 |     # Items and their type.
59 |     keys = ['text']
60 |     datatype = torch.int64
61 | 
62 |     # Broadcast data.
63 |     if data_iterator is not None:
64 |         data = next(data_iterator)
65 |     else:
66 |         data = None
67 |     #data_b = mpu.broadcast_data(keys, data, datatype)
68 |     data_b = data
69 | 
70 |     # Unpack.
71 |     #tokens_ = data_b['text'].long()
72 |     tokens_ = data_b['text'].int()
73 |     labels = tokens_[:, 1:].contiguous()
74 |     tokens = tokens_[:, :-1].contiguous()
75 | 
76 |     # Get the masks and position ids.
77 |     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
78 |         tokens,
79 |         tokenizer.eod,
80 |         args.reset_position_ids,
81 |         args.reset_attention_mask,
82 |         args.eod_mask_loss)
83 | 
84 |     return tokens, labels, loss_mask, attention_mask, position_ids
85 | 
86 | def loss_func(loss_mask, output_tensor):
87 |     losses = output_tensor.float()
88 |     loss_mask = loss_mask.view(-1).float()
89 |     loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
90 | 
91 |     if mpu.get_data_parallel_world_size() > 1:
92 |         # Reduce loss for logging.
93 |         averaged_loss = average_losses_across_data_parallel_group([loss])
94 |         return loss, {'lm loss': averaged_loss[0]}
95 |     else:
96 |         return loss, {'lm loss': loss}
97 | 
98 | 
99 | def forward_step(data_iterator, model):
100 |     """Forward step."""
101 |     args = get_args()
102 |     timers = get_timers()
103 | 
104 |     # Get the batch.
105 |     timers('batch-generator').start()
106 |     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
107 |         data_iterator)
108 |     timers('batch-generator').stop()
109 |     output_tensor = model(tokens, position_ids, attention_mask,
110 |                           labels=labels)
111 | 
112 |     return output_tensor, partial(loss_func, loss_mask)
113 | 
114 | 
115 | def train_valid_test_datasets_provider(train_val_test_num_samples):
116 |     """Build train, valid, and test datasets."""
117 |     args = get_args()
118 | 
119 |     print_rank_0('> building train, validation, and test datasets '
120 |                  'for GPT ...')
121 |     train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
122 |         data_prefix=args.data_path,
123 |         data_impl=args.data_impl,
124 |         splits_string=args.split,
125 |         train_valid_test_num_samples=train_val_test_num_samples,
126 |         seq_length=args.seq_length,
127 |         seed=args.seed,
128 |         skip_warmup=(not args.mmap_warmup))
129 |     print_rank_0("> finished creating GPT datasets ...")
130 |     return train_ds, valid_ds, test_ds
131 | 
132 | 
133 | def pretrain_mp(rank, world_size):
134 |     os.environ['RANK'] = str(rank)
135 |     os.environ['WORLD_SIZE'] = str(world_size)
136 |     pretrain(train_valid_test_datasets_provider, model_provider,
137 |              ModelType.encoder_or_decoder,
138 |              forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
139 |     xm.rendezvous('ending')
140 |     #xm.mark_step()
141 | 
142 | if __name__ == '__main__':
143 |     world_size = int(os.environ['NEURON_NUM_DEVICES'])
144 |     xmp.spawn(pretrain_mp,
145 |               args=(world_size,),
146 |               nprocs=world_size,
147 |               join=True)
148 | 
-------------------------------------------------------------------------------- /pretrain_vit.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """Pretrain VIT"""
17 | 
18 | import torch
19 | import torch.nn.functional as F
20 | from functools import partial
21 | from megatron import get_args, get_timers, mpu, print_rank_0
22 | from megatron.data.vit_dataset import build_train_valid_datasets
23 | from megatron.model import ModelType
24 | from megatron.model.vit_model import VitModel
25 | from megatron.training import pretrain
26 | from megatron.utils import average_losses_across_data_parallel_group
27 | 
28 | def model_provider(pre_process=True, post_process=True):
29 |     """Build the model."""
30 | 
31 |     print_rank_0("building VIT model ...")
32 |     args = get_args()
33 | 
34 |     model = VitModel(num_classes=args.num_classes,
35 |                      pre_process=pre_process,
36 |                      post_process=post_process)
37 |     return model
38 | 
39 | def get_batch(data_iterator):
40 |     """Build the batch."""
41 |     data = next(data_iterator)
42 | 
43 |     # only data parallelism; no need for broadcast
44 |     images = data[0].cuda()
45 |     labels = data[1].cuda()
46 | 
47 |     return images, labels
48 | 
49 | def loss_func(labels, output_tensor):
50 |     logits = output_tensor.contiguous().float()
51 |     loss = F.cross_entropy(logits, labels)
52 | 
53 |     outputs = torch.argmax(logits, -1)
54 |     correct = (outputs == labels).float()
55 |     accuracy = torch.mean(correct)
56 | 
57 |     averaged_loss = average_losses_across_data_parallel_group([loss, accuracy])
58 | 
59 |     return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}
60 | 
61 | def forward_step(data_iterator, model):
62 |     """Forward step."""
63 |     timers = get_timers()
64 | 
65 |     # Get the batch.
66 |     timers("batch-generator").start()
67 |     (
68 |         images,
69 |         labels,
70 |     ) = get_batch(data_iterator)
71 |     timers("batch-generator").stop()
72 | 
73 |     # Forward model.
74 |     output_tensor = model(images)
75 | 
76 |     return output_tensor, partial(loss_func, labels)
77 | 
78 | def train_valid_test_datasets_provider(train_val_test_num_samples):
79 |     """Build train, valid, and test datasets."""
80 |     args = get_args()
81 | 
82 |     print_rank_0(
83 |         "> building train, validation, and test datasets for VIT ..."
84 |     )
85 |     train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path)
86 |     print_rank_0("> finished creating VIT datasets ...")
87 | 
88 |     return train_ds, valid_ds, None
89 | 
90 | 
91 | if __name__ == "__main__":
92 | 
93 |     pretrain(
94 |         train_valid_test_datasets_provider,
95 |         model_provider,
96 |         ModelType.encoder_or_decoder,
97 |         forward_step,
98 |         args_defaults={'dataloader_type': 'cyclic'}
99 |     )
-------------------------------------------------------------------------------- /tasks/data_utils.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """ Tasks data utility."""
17 | 
18 | import re
19 | import numpy as np
20 | 
21 | 
22 | def clean_text(text):
23 |     """Remove new lines and multiple spaces and adjust end of sentence dot."""
24 | 
25 |     text = text.replace("\n", " ")
26 |     text = re.sub(r'\s+', ' ', text)
27 |     for _ in range(3):
28 |         text = text.replace(' . ', '. ')
29 | 
30 |     return text
31 | 
32 | 
33 | def build_sample(ids, types, paddings, label, unique_id):
34 |     """Convert to numpy and return a sample consumed by the batch producer."""
35 | 
36 |     ids_np = np.array(ids, dtype=np.int64)
37 |     types_np = np.array(types, dtype=np.int64)
38 |     paddings_np = np.array(paddings, dtype=np.int64)
39 |     sample = ({'text': ids_np,
40 |                'types': types_np,
41 |                'padding_mask': paddings_np,
42 |                'label': int(label),
43 |                'uid': int(unique_id)})
44 | 
45 |     return sample
46 | 
47 | 
48 | def build_tokens_types_paddings_from_text(text_a, text_b,
49 |                                           tokenizer, max_seq_length):
50 |     """Build token types and paddings, trim if needed, and pad if needed."""
51 | 
52 |     text_a_ids = tokenizer.tokenize(text_a)
53 |     text_b_ids = None
54 |     if text_b is not None:
55 |         text_b_ids = tokenizer.tokenize(text_b)
56 | 
57 |     return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
58 |                                                 max_seq_length, tokenizer.cls,
59 |                                                 tokenizer.sep, tokenizer.pad)
60 | 
61 | 
62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
63 |                                          cls_id, sep_id, pad_id):
64 |     """Build token types and paddings, trim if needed, and pad if needed."""
65 | 
66 |     ids = []
67 |     types = []
68 |     paddings = []
69 | 
70 |     # [CLS].
71 |     ids.append(cls_id)
72 |     types.append(0)
73 |     paddings.append(1)
74 | 
75 |     # A.
76 |     len_text_a = len(text_a_ids)
77 |     ids.extend(text_a_ids)
78 |     types.extend([0] * len_text_a)
79 |     paddings.extend([1] * len_text_a)
80 | 
81 |     # [SEP].
82 |     ids.append(sep_id)
83 |     types.append(0)
84 |     paddings.append(1)
85 | 
86 |     # B.
87 |     if text_b_ids is not None:
88 |         len_text_b = len(text_b_ids)
89 |         ids.extend(text_b_ids)
90 |         types.extend([1] * len_text_b)
91 |         paddings.extend([1] * len_text_b)
92 | 
93 |     # Cap the size.
94 |     trimmed = False
95 |     if len(ids) >= max_seq_length:
96 |         max_seq_length_m1 = max_seq_length - 1
97 |         ids = ids[0:max_seq_length_m1]
98 |         types = types[0:max_seq_length_m1]
99 |         paddings = paddings[0:max_seq_length_m1]
100 |         trimmed = True
101 | 
102 |     # [SEP].
103 |     if (text_b_ids is not None) or trimmed:
104 |         ids.append(sep_id)
105 |         if text_b_ids is None:
106 |             types.append(0)
107 |         else:
108 |             types.append(1)
109 |         paddings.append(1)
110 | 
111 |     # Padding.
112 |     padding_length = max_seq_length - len(ids)
113 |     if padding_length > 0:
114 |         ids.extend([pad_id] * padding_length)
115 |         types.extend([pad_id] * padding_length)
116 |         paddings.extend([0] * padding_length)
117 | 
118 |     return ids, types, paddings
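A worked example of the layout produced above (illustrative ids; 101/102/0 stand in for [CLS]/[SEP]/pad):

ids, types, paddings = build_tokens_types_paddings_from_ids(
    text_a_ids=[7, 8], text_b_ids=[9], max_seq_length=8,
    cls_id=101, sep_id=102, pad_id=0)
# ids      -> [101, 7, 8, 102, 9, 102, 0, 0]
# types    -> [0,   0, 0, 0,   1, 1,   0, 0]
# paddings -> [1,   1, 1, 1,   1, 1,   0, 0]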
-------------------------------------------------------------------------------- /tasks/glue/data.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """GLUE dataset."""
17 | 
18 | from abc import ABC
19 | from abc import abstractmethod
20 | 
21 | from torch.utils.data import Dataset
22 | 
23 | from megatron import print_rank_0
24 | from tasks.data_utils import build_sample
25 | from tasks.data_utils import build_tokens_types_paddings_from_text
26 | 
27 | 
28 | class GLUEAbstractDataset(ABC, Dataset):
29 |     """GLUE base dataset class."""
30 | 
31 |     def __init__(self, task_name, dataset_name, datapaths,
32 |                  tokenizer, max_seq_length):
33 |         # Store inputs.
34 |         self.task_name = task_name
35 |         self.dataset_name = dataset_name
36 |         self.tokenizer = tokenizer
37 |         self.max_seq_length = max_seq_length
38 |         print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
39 |                                                              self.dataset_name))
40 |         # Process the files.
41 |         string = '  > paths:'
42 |         for path in datapaths:
43 |             string += ' ' + path
44 |         print_rank_0(string)
45 |         self.samples = []
46 |         for datapath in datapaths:
47 |             self.samples.extend(self.process_samples_from_single_path(datapath))
48 |         print_rank_0('  >> total number of samples: {}'.format(
49 |             len(self.samples)))
50 | 
51 |     def __len__(self):
52 |         return len(self.samples)
53 | 
54 |     def __getitem__(self, idx):
55 |         raw_sample = self.samples[idx]
56 |         ids, types, paddings = build_tokens_types_paddings_from_text(
57 |             raw_sample['text_a'], raw_sample['text_b'],
58 |             self.tokenizer, self.max_seq_length)
59 |         sample = build_sample(ids, types, paddings,
60 |                               raw_sample['label'], raw_sample['uid'])
61 |         return sample
62 | 
63 |     @abstractmethod
64 |     def process_samples_from_single_path(self, datapath):
65 |         """Abstract method that takes a single path / filename and
66 |         returns a list of dataset samples, each sample being a dict of
67 |             {'text_a': string, 'text_b': string, 'label': int, 'uid': int}
68 |         """
69 |         pass
-------------------------------------------------------------------------------- /tasks/glue/finetune.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """GLUE finetuning/evaluation."""
17 | 
18 | from megatron import get_args
19 | from megatron import print_rank_0
20 | from megatron import get_tokenizer
21 | from megatron import mpu
22 | from megatron.model.classification import Classification
23 | from tasks.eval_utils import accuracy_func_provider
24 | from tasks.finetune_utils import finetune
25 | 
26 | 
27 | def glue_classification(num_classes, Dataset,
28 |                         name_from_datapath_func):
29 | 
30 |     def train_valid_datasets_provider():
31 |         """Build train and validation dataset."""
32 |         args = get_args()
33 |         tokenizer = get_tokenizer()
34 | 
35 |         train_dataset = Dataset('training', args.train_data,
36 |                                 tokenizer, args.seq_length)
37 |         valid_dataset = Dataset('validation', args.valid_data,
38 |                                 tokenizer, args.seq_length)
39 | 
40 |         return train_dataset, valid_dataset
41 | 
42 |     def model_provider(pre_process=True, post_process=True):
43 |         """Build the model."""
44 |         args = get_args()
45 | 
46 |         print_rank_0('building classification model for {} ...'.format(
47 |             args.task))
48 |         model = Classification(num_classes=num_classes, num_tokentypes=2,
49 |                                pre_process=pre_process, post_process=post_process)
50 | 
51 |         return model
52 | 
53 |     def metrics_func_provider():
54 |         """Provide metrics callback function."""
55 |         def single_dataset_provider(datapath):
56 |             args = get_args()
57 |             tokenizer = get_tokenizer()
58 | 
59 |             name = name_from_datapath_func(datapath)
60 |             return Dataset(name, [datapath], tokenizer, args.seq_length)
61 |         return accuracy_func_provider(single_dataset_provider)
62 | 
63 |     # Finetune/evaluate.
64 |     finetune(train_valid_datasets_provider, model_provider,
65 |              end_of_epoch_callback_provider=metrics_func_provider)
66 | 
67 | 
68 | def main():
69 |     args = get_args()
70 | 
71 |     if args.task == 'MNLI':
72 | 
73 |         num_classes = 3
74 |         from tasks.glue.mnli import MNLIDataset as Dataset
75 | 
76 |         def name_from_datapath(datapath):
77 |             return datapath.split('MNLI')[-1].strip(
78 |                 '.tsv').strip('/').replace('_', '-')
79 | 
80 |     elif args.task == 'QQP':
81 | 
82 |         num_classes = 2
83 |         from tasks.glue.qqp import QQPDataset as Dataset
84 | 
85 |         def name_from_datapath(datapath):
86 |             return datapath.split('QQP')[-1].strip(
87 |                 '.tsv').strip('/').replace('_', '-')
88 | 
89 |     else:
90 |         raise NotImplementedError('GLUE task {} is not implemented.'.format(
91 |             args.task))
92 | 
93 |     glue_classification(num_classes, Dataset, name_from_datapath)
-------------------------------------------------------------------------------- /tasks/glue/mnli.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """MNLI dataset."""
17 | 
18 | from megatron import print_rank_0
19 | from tasks.data_utils import clean_text
20 | from .data import GLUEAbstractDataset
21 | 
22 | 
23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2}
24 | 
25 | 
26 | class MNLIDataset(GLUEAbstractDataset):
27 | 
28 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
29 |                  test_label='contradiction'):
30 |         self.test_label = test_label
31 |         super().__init__('MNLI', name, datapaths,
32 |                          tokenizer, max_seq_length)
33 | 
34 |     def process_samples_from_single_path(self, filename):
35 |         """Implement abstract method."""
36 |         print_rank_0(' > Processing {} ...'.format(filename))
37 | 
38 |         samples = []
39 |         total = 0
40 |         first = True
41 |         is_test = False
42 |         with open(filename, 'r') as f:
43 |             for line in f:
44 |                 row = line.strip().split('\t')
45 |                 if first:
46 |                     first = False
47 |                     if len(row) == 10:
48 |                         is_test = True
49 |                         print_rank_0(
50 |                             '   reading {}, {} and {} columns and setting '
51 |                             'labels to {}'.format(
52 |                                 row[0].strip(), row[8].strip(),
53 |                                 row[9].strip(), self.test_label))
54 |                     else:
55 |                         print_rank_0('   reading {} , {}, {}, and {} columns '
56 |                                      '...'.format(
57 |                                          row[0].strip(), row[8].strip(),
58 |                                          row[9].strip(), row[-1].strip()))
59 |                     continue
60 | 
61 |                 text_a = clean_text(row[8].strip())
62 |                 text_b = clean_text(row[9].strip())
63 |                 unique_id = int(row[0].strip())
64 |                 label = row[-1].strip()
65 |                 if is_test:
66 |                     label = self.test_label
67 | 
68 |                 assert len(text_a) > 0
69 |                 assert len(text_b) > 0
70 |                 assert label in LABELS
71 |                 assert unique_id >= 0
72 | 
73 |                 sample = {'text_a': text_a,
74 |                           'text_b': text_b,
75 |                           'label': LABELS[label],
76 |                           'uid': unique_id}
77 |                 total += 1
78 |                 samples.append(sample)
79 | 
80 |                 if total % 50000 == 0:
81 |                     print_rank_0('  > processed {} so far ...'.format(total))
82 | 
83 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
84 |         return samples
-------------------------------------------------------------------------------- /tasks/glue/qqp.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """QQP dataset."""
17 | 
18 | from megatron import print_rank_0
19 | from tasks.data_utils import clean_text
20 | from .data import GLUEAbstractDataset
21 | 
22 | 
23 | LABELS = [0, 1]
24 | 
25 | 
26 | class QQPDataset(GLUEAbstractDataset):
27 | 
28 |     def __init__(self, name, datapaths, tokenizer, max_seq_length,
29 |                  test_label=0):
30 |         self.test_label = test_label
31 |         super().__init__('QQP', name, datapaths,
32 |                          tokenizer, max_seq_length)
33 | 
34 |     def process_samples_from_single_path(self, filename):
35 |         """Implement abstract method."""
36 |         print_rank_0(' > Processing {} ...'.format(filename))
37 | 
38 |         samples = []
39 |         total = 0
40 |         first = True
41 |         is_test = False
42 |         with open(filename, 'r') as f:
43 |             for line in f:
44 |                 row = line.strip().split('\t')
45 |                 if first:
46 |                     first = False
47 |                     if len(row) == 3:
48 |                         is_test = True
49 |                         print_rank_0('   reading {}, {}, and {} columns and '
50 |                                      'setting labels to {}'.format(
51 |                                          row[0].strip(), row[1].strip(),
52 |                                          row[2].strip(), self.test_label))
53 |                     else:
54 |                         assert len(row) == 6
55 |                         print_rank_0('   reading {}, {}, {}, and {} columns'
56 |                                      ' ...'.format(
57 |                                          row[0].strip(), row[3].strip(),
58 |                                          row[4].strip(), row[5].strip()))
59 |                     continue
60 | 
61 |                 if is_test:
62 |                     assert len(row) == 3, 'expected length 3: {}'.format(row)
63 |                     uid = int(row[0].strip())
64 |                     text_a = clean_text(row[1].strip())
65 |                     text_b = clean_text(row[2].strip())
66 |                     label = self.test_label
67 |                     assert len(text_a) > 0
68 |                     assert len(text_b) > 0
69 |                 else:
70 |                     if len(row) == 6:
71 |                         uid = int(row[0].strip())
72 |                         text_a = clean_text(row[3].strip())
73 |                         text_b = clean_text(row[4].strip())
74 |                         label = int(row[5].strip())
75 |                     else:
76 |                         print_rank_0('***WARNING*** index error, '
77 |                                      'skipping: {}'.format(row))
78 |                         continue
79 |                     if len(text_a) == 0:
80 |                         print_rank_0('***WARNING*** zero length a, '
81 |                                      'skipping: {}'.format(row))
82 |                         continue
83 |                     if len(text_b) == 0:
84 |                         print_rank_0('***WARNING*** zero length b, '
85 |                                      'skipping: {}'.format(row))
86 |                         continue
87 |                 assert label in LABELS
88 |                 assert uid >= 0
89 | 
90 |                 sample = {'uid': uid,
91 |                           'text_a': text_a,
92 |                           'text_b': text_b,
93 |                           'label': label}
94 |                 total += 1
95 |                 samples.append(sample)
96 | 
97 |                 if total % 50000 == 0:
98 |                     print_rank_0('  > processed {} so far ...'.format(total))
99 | 
100 |         print_rank_0(' >> processed {} samples.'.format(len(samples)))
101 |         return samples
-------------------------------------------------------------------------------- /tasks/orqa/README.md: --------------------------------------------------------------------------------
1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
2 | 
3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
4 | 
5 | ## Retriever Training
6 | 
7 | #### Unsupervised pretraining
8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit.
We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
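For reference, each line of the input corpus (`/path/to/corpus.json` above) should be a self-contained JSON object carrying the keys passed to `--json-keys`; a minimal sketch of one such line (the field values are illustrative only):

```
{"text": "The quick brown fox jumped over the lazy dog.", "title": "An Example Document"}
```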
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs single-GPU ICT retriever training for a 217M-parameter biencoder model. Single-GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script starts from a pretrained BERT model, and we use a total batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example of how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top of [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
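# The script below runs in two phases: (1) an IndexBuilder embeds the evidence
# blocks and saves the index, and (2) an ORQAEvaluator scores the dev and test
# question sets. The qa_data_dev and qa_data_test attributes presumably map to
# --qa-data-dev and --qa-data-test command-line flags; that mapping is an
# assumption based on argparse naming conventions, not verified documentation.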
15 | 16 | """Main tasks functionality.""" 17 | 18 | from megatron import get_args, print_rank_0 19 | from megatron.indexer import IndexBuilder 20 | from tasks.orqa.evaluate_utils import ORQAEvaluator 21 | 22 | def main(): 23 | """ 24 | Main program 25 | """ 26 | 27 | args = get_args() 28 | 29 | """ 30 | Create a BlockData data structure by running an IndexBuilder over an 31 | ICT Dataset and then evaluate on NQ task 32 | """ 33 | 34 | print_rank_0("Starting index builder!") 35 | 36 | index_builder = IndexBuilder() 37 | index_builder.build_and_save_index() 38 | print_rank_0("Build and save indices: done!") 39 | 40 | 41 | print_rank_0("Starting evaluations!") 42 | 43 | # Set up the model and evaluator 44 | evaluator = ORQAEvaluator() 45 | 46 | # Run evaluation 47 | if args.qa_data_dev is not None: 48 | evaluator.evaluate(args.qa_data_dev, "DEV") 49 | 50 | if args.qa_data_test is not None: 51 | evaluator.evaluate(args.qa_data_test, "TEST") 52 | 53 | -------------------------------------------------------------------------------- /tasks/race/data.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | import json 4 | import os 5 | import time 6 | 7 | from torch.utils.data import Dataset 8 | 9 | from megatron import print_rank_0 10 | from tasks.data_utils import build_sample 11 | from tasks.data_utils import build_tokens_types_paddings_from_ids 12 | from tasks.data_utils import clean_text 13 | 14 | 15 | NUM_CHOICES = 4 16 | MAX_QA_LENGTH = 128 17 | 18 | 19 | class RaceDataset(Dataset): 20 | 21 | def __init__(self, dataset_name, datapaths, tokenizer, max_seq_length, 22 | max_qa_length=MAX_QA_LENGTH): 23 | 24 | self.dataset_name = dataset_name 25 | print_rank_0(' > building RACE dataset for {}:'.format( 26 | self.dataset_name)) 27 | 28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | 33 | self.samples = [] 34 | for datapath in datapaths: 35 | self.samples.extend(process_single_datapath(datapath, tokenizer, 36 | max_qa_length, 37 | max_seq_length)) 38 | 39 | print_rank_0(' >> total number of samples: {}'.format( 40 | len(self.samples))) 41 | 42 | # This indicates that each "sample" has multiple samples that 43 | # will collapse into batch dimension 44 | self.sample_multiplier = NUM_CHOICES 45 | 46 | def __len__(self): 47 | return len(self.samples) 48 | 49 | def __getitem__(self, idx): 50 | return self.samples[idx] 51 | 52 | 53 | def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length): 54 | """Read in RACE files, combine, clean-up, tokenize, and convert to 55 | samples.""" 56 | 57 | print_rank_0(' > working on {}'.format(datapath)) 58 | start_time = time.time() 59 | 60 | # Get list of files. 61 | filenames = glob.glob(os.path.join(datapath, '*.txt')) 62 | 63 | samples = [] 64 | num_docs = 0 65 | num_questions = 0 66 | num_samples = 0 67 | # Load all the files 68 | for filename in filenames: 69 | with open(filename, 'r') as f: 70 | for line in f: 71 | data = json.loads(line) 72 | num_docs += 1 73 | 74 | context = data["article"] 75 | questions = data["questions"] 76 | choices = data["options"] 77 | answers = data["answers"] 78 | # Check the length. 79 | assert len(questions) == len(answers) 80 | assert len(questions) == len(choices) 81 | 82 | # Context: clean up and convert to ids. 83 | context = clean_text(context) 84 | context_ids = tokenizer.tokenize(context) 85 | 86 | # Loop over questions. 
87 | for qi, question in enumerate(questions): 88 | num_questions += 1 89 | # Label. 90 | label = ord(answers[qi]) - ord("A") 91 | assert label >= 0 92 | assert label < NUM_CHOICES 93 | assert len(choices[qi]) == NUM_CHOICES 94 | 95 | # For each question, build num-choices samples. 96 | ids_list = [] 97 | types_list = [] 98 | paddings_list = [] 99 | for ci in range(NUM_CHOICES): 100 | choice = choices[qi][ci] 101 | # Merge with choice. 102 | if "_" in question: 103 | qa = question.replace("_", choice) 104 | else: 105 | qa = " ".join([question, choice]) 106 | # Clean QA. 107 | qa = clean_text(qa) 108 | # Tokenize. 109 | qa_ids = tokenizer.tokenize(qa) 110 | # Trim if needed. 111 | if len(qa_ids) > max_qa_length: 112 | qa_ids = qa_ids[0:max_qa_length] 113 | 114 | # Build the sample. 115 | ids, types, paddings \ 116 | = build_tokens_types_paddings_from_ids( 117 | qa_ids, context_ids, max_seq_length, 118 | tokenizer.cls, tokenizer.sep, tokenizer.pad) 119 | 120 | ids_list.append(ids) 121 | types_list.append(types) 122 | paddings_list.append(paddings) 123 | 124 | # Convert to numpy and add to samples 125 | samples.append(build_sample(ids_list, types_list, 126 | paddings_list, label, 127 | num_samples)) 128 | num_samples += 1 129 | 130 | elapsed_time = time.time() - start_time 131 | print_rank_0(' > processed {} documents, {} questions, and {} samples' 132 | ' in {:.2f} seconds'.format(num_docs, num_questions, 133 | num_samples, elapsed_time)) 134 | 135 | return samples 136 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Privde metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(pre_process=True, post_process=True): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True, 44 | pre_process=pre_process, post_process=post_process) 45 | 46 | """Finetune/evaluate.""" 47 | finetune( 48 | train_valid_datasets_provider, 49 | model_provider, 50 | end_of_epoch_callback_provider=accuracy_func_provider, 51 | ) 52 | 53 | 54 | def main(): 55 | classification() 56 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | from functools import partial 20 | 21 | import torch 22 | 23 | from megatron import get_args 24 | from megatron import print_rank_0, print_rank_last 25 | from megatron import mpu 26 | from megatron.schedules import get_forward_backward_func 27 | from tasks.vision.finetune_utils import build_data_loader 28 | from tasks.vision.finetune_utils import process_batch 29 | from torchvision import datasets, transforms 30 | 31 | 32 | def accuracy_func_provider(): 33 | """Provide function that calculates accuracies.""" 34 | args = get_args() 35 | data_path = args.data_path 36 | crop_size = args.img_dim 37 | 38 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 39 | # Build dataloaders. 
40 | val_data_path = os.path.join(data_path[0], "val") 41 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 42 | transform_val = transforms.Compose( 43 | [ 44 | transforms.Resize(crop_size), 45 | transforms.CenterCrop(crop_size), 46 | transforms.ToTensor(), 47 | normalize, 48 | ] 49 | ) 50 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 51 | 52 | dataloader = build_data_loader( 53 | dataset, 54 | args.micro_batch_size, 55 | num_workers=args.num_workers, 56 | drop_last=(mpu.get_data_parallel_world_size() > 1), 57 | ) 58 | 59 | def metrics_func(model, epoch): 60 | print_rank_0("calculating metrics ...") 61 | correct, total = calculate_correct_answers(model, dataloader, epoch) 62 | percent = float(correct) * 100.0 / float(total) 63 | print_rank_last( 64 | " >> |epoch: {}| overall: correct / total = {} / {} = " 65 | "{:.4f} %".format(epoch, correct, total, percent) 66 | ) 67 | 68 | return metrics_func 69 | 70 | 71 | def calculate_correct_answers(model, dataloader, epoch): 72 | """Calculate correct over total answers""" 73 | 74 | args = get_args() 75 | forward_backward_func = get_forward_backward_func() 76 | for m in model: 77 | m.eval() 78 | 79 | def loss_func(labels, output_tensor): 80 | logits = output_tensor 81 | 82 | loss_dict = {} 83 | # Compute the correct answers. 84 | predicted = torch.argmax(logits, dim=-1) 85 | corrects = (predicted == labels).float() 86 | # Add to the counters. 87 | loss_dict['total'] = labels.size(0) 88 | loss_dict['correct'] = corrects.sum().item() 89 | 90 | return 0, loss_dict 91 | 92 | # Defined inside so the forward step can bind labels into loss_func. 93 | def correct_answers_forward_step(batch, model): 94 | try: 95 | batch_ = next(batch) 96 | except BaseException: 97 | batch_ = batch 98 | images, labels = process_batch(batch_) 99 | 100 | # Forward model. 101 | output_tensor = model(images) 102 | 103 | 104 | return output_tensor, partial(loss_func, labels) 105 | 106 | with torch.no_grad(): 107 | # For all the batches in the dataset. 108 | total = 0 109 | correct = 0 110 | for _, batch in enumerate(dataloader): 111 | 112 | loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model, 113 | optimizer=None, timers=None, forward_only=True) 114 | 115 | for loss_dict in loss_dicts: 116 | total += loss_dict['total'] 117 | correct += loss_dict['correct'] 118 | 119 | for m in model: 120 | m.train() 121 | 122 | # Reduce. 123 | if mpu.is_pipeline_last_stage(): 124 | unreduced = torch.cuda.LongTensor([correct, total]) 125 | torch.distributed.all_reduce(unreduced, 126 | group=mpu.get_data_parallel_group()) 127 | 128 | # Print on screen. 129 | correct_ans = unreduced[0].item() 130 | total_count = unreduced[1].item() 131 | return correct_ans, total_count 132 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. Zero results in " 43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader.", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ")
") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare training dataset to train the mode. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 
16 | ``` 17 | python blacklist_urls.py <path to the downloaded deduplicated URLs> <output file for clean URLs> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT training: 24 | 25 | 1. Run ftfy and English detection, and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data file> 28 | ``` 29 | Additional cleanup (e.g. removing documents shorter than 512 characters, or dataset-specific cleaning for the stories and realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <input data files> --output <output possible duplicates file> 33 | ``` 34 | 3. Based on the similarity measure defined inside the `is_similar` function (default threshold: 0.9), group urls that are similar. For each group, we keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <possible duplicates file> <output url groups file> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <url groups file> <data file> <output data file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <deduplicated data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <task files> --dedup-dataset <training dataset> --output <output training dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, or any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path via `--lambada-path <path to lambada task data>`. 58 | 59 | Several other features (e.g. save and load dictionary) have been added; see `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
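# Illustrative input/output for this script (the field values are made up):
#   in:  {"text": "some document ...", "url": "https://example.com/a"}
#   out: {"text": "some document ...", "url": "https://example.com/a",
#         "adlr_id": "owt-0000000001"}
# where "owt" stands in for whatever --id-prefix is passed on the command line.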
15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. The user can add a 23 | prefix to the ids. 24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the files. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/cleanup_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
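# Usage (positional arguments, as read from sys.argv at the bottom of this
# file):
#   python cleanup_dataset.py <input loose-json file> <output file>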
15 | 16 | 17 | import ftfy 18 | import json 19 | from langdetect import detect 20 | import numpy as np 21 | import time 22 | import os 23 | import sys 24 | 25 | from tokenizer import Tokenizer 26 | 27 | MIN_DOCUMENT_LENGTH = 128 28 | 29 | 30 | def print_progress(prefix, start_time, num_docs, num_fixed_text, 31 | num_non_english_docs, chars_non_english_docs, 32 | num_small_docs, chars_small_docs): 33 | 34 | string = prefix + ' | ' 35 | string += 'elapsed time: {:.2f} | '.format(time.time() - start_time) 36 | string += 'documents: {} | '.format(num_docs) 37 | string += 'fixed text: {} | '.format(num_fixed_text) 38 | string += 'non-english: {} | '.format(num_non_english_docs) 39 | string += 'non-english chars: {} | '.format(chars_non_english_docs) 40 | string += 'small docs: {} | '.format(num_small_docs) 41 | string += 'small docs chars: {}'.format(chars_small_docs) 42 | print(string, flush=True) 43 | 44 | 45 | def filter_corpus(filename, out_filename, print_interval=10000): 46 | 47 | print(' > filtering {}'.format(filename)) 48 | 49 | tokenizer = Tokenizer(cache_dir='./cache') 50 | 51 | num_docs = 0 52 | num_written_docs = 0 53 | num_small_docs = 0 54 | num_fixed_text = 0 55 | num_non_english_docs = 0 56 | chars_non_english_docs = 0 57 | chars_small_docs = 0 58 | start_time = time.time() 59 | with open(out_filename, 'wb') as f: 60 | with open(filename, 'r') as fin: 61 | for line in fin: 62 | try: 63 | num_docs += 1 64 | myjson = json.loads(line) 65 | # Fix text 66 | text = ftfy.fix_text(myjson['text']) 67 | if text != myjson['text']: 68 | num_fixed_text += 1 69 | myjson['text'] = text 70 | # Detect language. 71 | if detect(text) != 'en': 72 | print('[non-english text]', myjson) 73 | num_non_english_docs += 1 74 | chars_non_english_docs += len(text) 75 | continue 76 | # On average each token is 5 characters so 8 is an 77 | # upper bound. 78 | if len(text) < (8 * MIN_DOCUMENT_LENGTH): 79 | tokens = tokenizer.tokenize_document(text) 80 | if len(tokens) < MIN_DOCUMENT_LENGTH: 81 | print('[small document, skipping]:', myjson) 82 | num_small_docs += 1 83 | chars_small_docs += len(text) 84 | continue 85 | myjson = json.dumps(myjson, ensure_ascii=False) 86 | f.write(myjson.encode('utf-8')) 87 | f.write('\n'.encode('utf-8')) 88 | num_written_docs += 1 89 | if num_docs % print_interval == 0: 90 | print_progress('[PROGRESS]', start_time, num_docs, 91 | num_fixed_text, num_non_english_docs, 92 | chars_non_english_docs, 93 | num_small_docs, chars_small_docs) 94 | except Exception as e: 95 | print(' skipping ', line, e) 96 | 97 | print_progress('[FINAL]', start_time, num_docs, 98 | num_fixed_text, num_non_english_docs, 99 | chars_non_english_docs, 100 | num_small_docs, chars_small_docs) 101 | 102 | 103 | if __name__ == '__main__': 104 | 105 | print('building gpt2 dataset ...') 106 | 107 | input_filename = sys.argv[1] 108 | output_filename = sys.argv[2] 109 | 110 | print('will be reading {}'.format(input_filename)) 111 | print('and will write the results to {}'.format(output_filename)) 112 | 113 | filter_corpus(input_filename, output_filename) 114 | 115 | 116 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
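# Usage, based on the argparse definitions below:
#   python merge_jsons.py --json_path <directory with *.json files> \
#                         --output_file <merged_output.json>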
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row)  # parse each line to verify it is valid JSON 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
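# Usage (positional arguments, as read from sys.argv below):
#   python remove_group_duplicates.py <url groups file> <data file> <output file>
# <url groups file> is the output of group_duplicate_url.py; within each group,
# every url except the first is treated as a duplicate and its document dropped.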
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /tools/run_text_generation_server.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
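# A sketch of launching the server (model-size and parallelism flags are
# omitted here and must match the checkpoint being loaded):
#   python tools/run_text_generation_server.py --load <checkpoint dir> \
#       --temperature 1.0 --top_p 0.9 --out-seq-length 1024 ...
# Once the server is up, tools/text_generation_cli.py can PUT prompts to it.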
15 | 16 | """Sample Generate GPT""" 17 | import os 18 | import sys 19 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 20 | os.path.pardir))) 21 | import socket 22 | from megatron import get_args 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | from megatron.checkpointing import load_checkpoint 26 | from megatron.initialize import initialize_megatron 27 | from megatron.model import GPTModel 28 | from megatron.training import get_model 29 | from megatron.text_generation_server import MegatronServer 30 | from megatron.text_generation_utils import generate 31 | import torch 32 | 33 | def model_provider(pre_process=True, post_process=True): 34 | """Build the model.""" 35 | 36 | print_rank_0('building GPT model ...') 37 | model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process) 38 | 39 | return model 40 | 41 | def add_text_generate_args(parser): 42 | group = parser.add_argument_group(title='text generation') 43 | 44 | group.add_argument("--temperature", type=float, default=1.0, 45 | help='Sampling temperature.') 46 | group.add_argument("--greedy", action='store_true', default=False, 47 | help='Use greedy sampling.') 48 | group.add_argument("--top_p", type=float, default=0.0, 49 | help='Top p sampling.') 50 | group.add_argument("--top_k", type=int, default=0, 51 | help='Top k sampling.') 52 | group.add_argument("--out-seq-length", type=int, default=1024, 53 | help='Size of the output generated text.') 54 | return parser 55 | 56 | 57 | if __name__ == "__main__": 58 | initialize_megatron(extra_args_provider=add_text_generate_args, 59 | args_defaults={'tokenizer_type': 'GPT2BPETokenizer', 60 | 'no_load_rng': True, 61 | 'no_load_optim': True}) 62 | 63 | args = get_args() 64 | if args.num_layers_per_virtual_pipeline_stage is not None: 65 | print("Interleaved pipeline schedule is not yet supported for text generation.") 66 | exit() 67 | # Set up model and load checkpoint 68 | model = get_model(model_provider, wrap_with_ddp=False) 69 | 70 | if args.load is not None: 71 | _ = load_checkpoint(model, None, None) 72 | 73 | assert len(model) == 1, "Above condition should have caught this" 74 | model = model[0] 75 | if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: 76 | server = MegatronServer(model) 77 | server.run("0.0.0.0") 78 | 79 | while True: 80 | choice = torch.cuda.LongTensor(1) 81 | torch.distributed.broadcast(choice, 0) 82 | if choice[0].item() == 0: 83 | generate(model) 84 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import json 16 | import sys 17 | from urllib import request 18 | 19 | 20 | if __name__ == "__main__": 21 | url = sys.argv[1] 22 | while True: 23 | sentence = input("Enter prompt: ") 24 | max_len = int(input("Enter number tokens output: ")) 25 | data = json.dumps({"sentences": [sentence], "max_len": max_len}) 26 | # Send the prompt to the server as an HTTP PUT with a JSON payload. 27 | req = request.Request(url, data.encode('utf-8'), 28 | {'Content-Type': 'application/json'}, 29 | method='PUT') 30 | response = request.urlopen(req) 31 | resp_sentences = json.load(response) 32 | print("Megatron Response: ") 33 | print(resp_sentences["sentences"][0]) 34 | --------------------------------------------------------------------------------